diff --git a/eval_results/mistralai__Mixtral-8x7B-v0.1/results_2025-05-30T22-16-44.992858.json b/eval_results/mistralai__Mixtral-8x7B-v0.1/results_2025-05-30T22-16-44.992858.json new file mode 100644 index 0000000000000000000000000000000000000000..aa64373f38b744db0942310f4537fc76e550d7b4 --- /dev/null +++ b/eval_results/mistralai__Mixtral-8x7B-v0.1/results_2025-05-30T22-16-44.992858.json @@ -0,0 +1,4057 @@ +{ + "results": { + "openllm": { + " ": " ", + "alias": "Open LLM Leaderboard" + }, + "arc_challenge": { + "alias": " - arc_challenge", + "acc,none": 0.6373720136518771, + "acc_stderr,none": 0.014049106564955002, + "acc_norm,none": 0.6663822525597269, + "acc_norm_stderr,none": 0.01377868705417653 + }, + "gsm8k": { + "alias": " - gsm8k", + "exact_match,strict-match": 0.5905989385898408, + "exact_match_stderr,strict-match": 0.013544504071244514, + "exact_match,flexible-extract": 0.5943896891584534, + "exact_match_stderr,flexible-extract": 0.013524848894462115 + }, + "hellaswag": { + "alias": " - hellaswag", + "acc,none": 0.6708822943636725, + "acc_stderr,none": 0.004689324696186881, + "acc_norm,none": 0.861979685321649, + "acc_norm_stderr,none": 0.003442163843362882 + }, + "mmlu": { + "acc,none": 0.7053126335279875, + "acc_stderr,none": 0.003596517880218624, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6486716259298618, + "acc_stderr,none": 0.006479599568839706, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5714285714285714, + "acc_stderr,none": 0.0442626668137991 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.8242424242424242, + "acc_stderr,none": 0.02972094300622445 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8627450980392157, + "acc_stderr,none": 0.024152225962801584 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8818565400843882, + "acc_stderr,none": 0.021011052659878463 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8677685950413223, + "acc_stderr,none": 0.030922788320445795 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8333333333333334, + "acc_stderr,none": 0.03602814176392645 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.7730061349693251, + "acc_stderr,none": 0.03291099578615771 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7976878612716763, + "acc_stderr,none": 0.021628077380196124 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.40558659217877097, + "acc_stderr,none": 0.016421670506339168 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7845659163987139, + "acc_stderr,none": 0.023350225475471442 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.8487654320987654, + "acc_stderr,none": 0.019935086092149876 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5417209908735332, + "acc_stderr,none": 0.012725701656953642 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8713450292397661, + "acc_stderr,none": 0.02567934272327694 + }, + "mmlu_other": { + "acc,none": 0.7740585774058577, + "acc_stderr,none": 0.007133218182046057, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.73, + "acc_stderr,none": 
0.044619604333847394 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7886792452830189, + "acc_stderr,none": 0.025125766484827845 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7283236994219653, + "acc_stderr,none": 0.03391750322321659 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7847533632286996, + "acc_stderr,none": 0.027584066602208274 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8737864077669902, + "acc_stderr,none": 0.03288180278808629 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9102564102564102, + "acc_stderr,none": 0.018724301741941635 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.78, + "acc_stderr,none": 0.04163331998932261 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8786717752234994, + "acc_stderr,none": 0.011675913883906723 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.8366013071895425, + "acc_stderr,none": 0.021170623011213512 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5319148936170213, + "acc_stderr,none": 0.029766675075873866 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7977941176470589, + "acc_stderr,none": 0.024398192986654924 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5060240963855421, + "acc_stderr,none": 0.03892212195333045 + }, + "mmlu_social_sciences": { + "acc,none": 0.8121546961325967, + "acc_stderr,none": 0.006919714343596674, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6578947368421053, + "acc_stderr,none": 0.04462917535336937 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8636363636363636, + "acc_stderr,none": 0.024450155973189835 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.9378238341968912, + "acc_stderr,none": 0.017426974154240524 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.7128205128205128, + "acc_stderr,none": 0.022939925418530616 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7941176470588235, + "acc_stderr,none": 0.02626502460827588 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8825688073394495, + "acc_stderr,none": 0.01380278022737732 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.816793893129771, + "acc_stderr,none": 0.03392770926494733 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.7777777777777778, + "acc_stderr,none": 0.016819028375736386 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.7, + "acc_stderr,none": 0.04389311454644286 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7836734693877551, + "acc_stderr,none": 0.026358916334904045 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8905472636815921, + "acc_stderr,none": 0.022076326101824608 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.93, + 
"acc_stderr,none": 0.0256432399976243 + }, + "mmlu_stem": { + "acc,none": 0.6178242943228671, + "acc_stderr,none": 0.008249947196050048, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.38, + "acc_stderr,none": 0.04878317312145633 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6814814814814815, + "acc_stderr,none": 0.04024778401977109 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.8223684210526315, + "acc_stderr,none": 0.031103182383123384 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8541666666666666, + "acc_stderr,none": 0.029514245964291762 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.54, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.58, + "acc_stderr,none": 0.049604496374885836 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.48, + "acc_stderr,none": 0.050211673156867795 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4803921568627451, + "acc_stderr,none": 0.04971358884367406 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.81, + "acc_stderr,none": 0.03942772444036623 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6680851063829787, + "acc_stderr,none": 0.030783736757745647 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6689655172413793, + "acc_stderr,none": 0.039215453124671215 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.49206349206349204, + "acc_stderr,none": 0.02574806587167329 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8419354838709677, + "acc_stderr,none": 0.020752831511875267 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.6403940886699507, + "acc_stderr,none": 0.03376458246509568 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.72, + "acc_stderr,none": 0.045126085985421276 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.37777777777777777, + "acc_stderr,none": 0.029560707392465715 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.46357615894039733, + "acc_stderr,none": 0.04071636065944217 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.6435185185185185, + "acc_stderr,none": 0.032664783315272714 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.5892857142857143, + "acc_stderr,none": 0.04669510663875191 + }, + "truthfulqa_gen": { + "alias": " - truthfulqa_gen", + "bleu_max,none": 27.911232508498426, + "bleu_max_stderr,none": 0.8106547087099264, + "bleu_acc,none": 0.44063647490820074, + "bleu_acc_stderr,none": 0.017379697555437446, + "bleu_diff,none": -2.7149338401323995, + "bleu_diff_stderr,none": 0.8105309556711907, + "rouge1_max,none": 53.53052750908425, + "rouge1_max_stderr,none": 0.8705222327844533, + "rouge1_acc,none": 0.43818849449204406, + "rouge1_acc_stderr,none": 0.017369236164404434, + "rouge1_diff,none": -3.8221294639026686, + "rouge1_diff_stderr,none": 0.9512456166796464, + "rouge2_max,none": 37.92580985775092, + "rouge2_max_stderr,none": 
1.021406726801759, + "rouge2_acc,none": 0.3708690330477356, + "rouge2_acc_stderr,none": 0.016909693580248828, + "rouge2_diff,none": -5.092332229167073, + "rouge2_diff_stderr,none": 1.1070311425609114, + "rougeL_max,none": 50.93369782630966, + "rougeL_max_stderr,none": 0.8793511551021356, + "rougeL_acc,none": 0.42962056303549573, + "rougeL_acc_stderr,none": 0.0173292345804091, + "rougeL_diff,none": -3.9618623263413393, + "rougeL_diff_stderr,none": 0.9565565804334322 + }, + "truthfulqa_mc1": { + "alias": " - truthfulqa_mc1", + "acc,none": 0.34394124847001223, + "acc_stderr,none": 0.016629087514276775 + }, + "truthfulqa_mc2": { + "alias": " - truthfulqa_mc2", + "acc,none": 0.486118184178896, + "acc_stderr,none": 0.014546300990229502 + }, + "winogrande": { + "alias": " - winogrande", + "acc,none": 0.819258089976322, + "acc_stderr,none": 0.010814911009613971 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.7053126335279875, + "acc_stderr,none": 0.003596517880218624, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6486716259298618, + "acc_stderr,none": 0.006479599568839706, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7740585774058577, + "acc_stderr,none": 0.007133218182046057, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.8121546961325967, + "acc_stderr,none": 0.006919714343596674, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.6178242943228671, + "acc_stderr,none": 0.008249947196050048, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_world_history", + "mmlu_jurisprudence", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_international_law", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_prehistory", + "mmlu_logical_fallacies", + "mmlu_philosophy" + ], + "mmlu_social_sciences": [ + "mmlu_public_relations", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_psychology", + "mmlu_sociology", + "mmlu_professional_psychology", + "mmlu_us_foreign_policy", + "mmlu_security_studies", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_human_sexuality", + "mmlu_high_school_microeconomics", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_nutrition", + "mmlu_human_aging", + "mmlu_management", + "mmlu_college_medicine", + "mmlu_professional_accounting", + "mmlu_miscellaneous", + "mmlu_business_ethics", + "mmlu_professional_medicine", + "mmlu_medical_genetics", + "mmlu_virology", + "mmlu_clinical_knowledge", + "mmlu_global_facts", + "mmlu_marketing" + ], + "mmlu_stem": [ + "mmlu_computer_security", + "mmlu_abstract_algebra", + "mmlu_high_school_computer_science", + "mmlu_machine_learning", + "mmlu_high_school_chemistry", + "mmlu_college_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_statistics", + "mmlu_college_computer_science", + "mmlu_high_school_biology", + "mmlu_high_school_physics", + "mmlu_college_chemistry", + "mmlu_college_mathematics", + "mmlu_college_biology", + "mmlu_high_school_mathematics", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_conceptual_physics" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "openllm": [ + "arc_challenge", + "hellaswag", + "truthfulqa_gen", + "truthfulqa_mc2", + "truthfulqa_mc1", + "mmlu", + "winogrande", + "gsm8k" + ] + }, + "configs": { + "arc_challenge": { + "task": 
"arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "gsm8k": { + "task": "gsm8k", + "tag": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### ", + "\\.$" + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "Question:", + "", + "<|im_end|>" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "strict-match", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + }, + { + "name": "flexible-extract", + "filter": [ + { + "function": "regex", + "group_select": -1, + "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 3.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + 
"pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:\n return dataset.map(preprocess_function)\n", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "unsafe_code": false, + "process_results": "def process_results_gen(doc, results):\n completion = results[0]\n true_refs, false_refs = doc[\"correct_answers\"], doc[\"incorrect_answers\"]\n all_refs = true_refs + false_refs\n\n # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.\n\n # # BLEURT\n # bleurt_scores_true = self.bleurt.compute(\n # predictions=[completion] * len(true_refs), references=true_refs\n # )[\"scores\"]\n # bleurt_scores_false = self.bleurt.compute(\n # predictions=[completion] * len(false_refs), references=false_refs\n # )[\"scores\"]\n # bleurt_correct = max(bleurt_scores_true)\n # bleurt_incorrect = max(bleurt_scores_false)\n # bleurt_max = bleurt_correct\n # bleurt_diff = bleurt_correct - bleurt_incorrect\n # bleurt_acc = int(bleurt_correct > bleurt_incorrect)\n\n # BLEU\n bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]\n bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])\n bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])\n bleu_max = bleu_correct\n bleu_diff = bleu_correct - bleu_incorrect\n bleu_acc = int(bleu_correct > bleu_incorrect)\n\n # ROUGE-N\n rouge_scores = [rouge([ref], [completion]) for ref in all_refs]\n # ROUGE-1\n rouge1_scores = [score[\"rouge1\"] for score in rouge_scores]\n rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])\n rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])\n rouge1_max = rouge1_correct\n rouge1_diff = rouge1_correct - rouge1_incorrect\n rouge1_acc = int(rouge1_correct > rouge1_incorrect)\n # ROUGE-2\n rouge2_scores = [score[\"rouge2\"] for score in rouge_scores]\n rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])\n rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])\n rouge2_max = rouge2_correct\n rouge2_diff = rouge2_correct - rouge2_incorrect\n rouge2_acc = 
int(rouge2_correct > rouge2_incorrect)\n # ROUGE-L\n rougeL_scores = [score[\"rougeLsum\"] for score in rouge_scores]\n rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])\n rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])\n rougeL_max = rougeL_correct\n rougeL_diff = rougeL_correct - rougeL_incorrect\n rougeL_acc = int(rougeL_correct > rougeL_incorrect)\n\n return {\n # \"bleurt_max\": bleurt_max,\n # \"bleurt_acc\": bleurt_acc,\n # \"bleurt_diff\": bleurt_diff,\n \"bleu_max\": bleu_max,\n \"bleu_acc\": bleu_acc,\n \"bleu_diff\": bleu_diff,\n \"rouge1_max\": rouge1_max,\n \"rouge1_acc\": rouge1_acc,\n \"rouge1_diff\": rouge1_diff,\n \"rouge2_max\": rouge2_max,\n \"rouge2_acc\": rouge2_acc,\n \"rouge2_diff\": rouge2_diff,\n \"rougeL_max\": rougeL_max,\n \"rougeL_acc\": rougeL_acc,\n \"rougeL_diff\": rougeL_diff,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "def process_results_mc2(doc, results):\n ll, _ = zip(*results)\n ll = np.array(ll)\n\n # Convert log-likelihoods to probabilities.\n probs = np.exp(ll)\n\n # Normalize probabilities.\n probs_norm = probs / np.sum(probs)\n\n labels = np.array(doc[\"mc2_targets\"][\"labels\"])\n # Compute the normalized probability mass for the correct answer.\n pm_true = np.sum(probs_norm[labels == 1])\n\n return {\"acc\": pm_true}\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n 
return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + } + }, + "versions": { + "arc_challenge": 1.0, + "gsm8k": 3.0, + "hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "truthfulqa_gen": 3.0, + "truthfulqa_mc1": 2.0, + "truthfulqa_mc2": 3.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 25, + "gsm8k": 5, + "hellaswag": 10, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + 
"mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0, + "winogrande": 5 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "gsm8k": { + "exact_match": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "mmlu": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": 
true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "openllm": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "truthfulqa_gen": { + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true + }, + "truthfulqa_mc1": { + "acc": true + }, + "truthfulqa_mc2": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "arc_challenge": { + "original": 1172, + "effective": 1172 + }, + "hellaswag": { + "original": 10042, + "effective": 10042 + }, + "truthfulqa_gen": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc2": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc1": { + "original": 817, + "effective": 817 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_computer_science": { + "original": 100, + 
"effective": 100 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": 
{ + "original": 1534, + "effective": 1534 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "winogrande": { + "original": 1267, + "effective": 1267 + }, + "gsm8k": { + "original": 1319, + "effective": 1319 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,attn_implementation=flash_attention_2", + "model_num_parameters": 46702792704, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "ffe1a706bacbd5abddc5ff99432ee38f7e0662fb", + "batch_size": "64", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "8bc4aff", + "date": 1748642396.9835355, + "pretty_env_info": "PyTorch version: 2.7.0a0+git6374332\nIs debug build: False\nCUDA used to build PyTorch: N/A\nROCM used to build PyTorch: 6.3.42134-a9a80e791\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 18.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.3.4 25012 e5bf7e55c91490b07c49d8960fa7983d864936c4)\nCMake version: version 3.31.6\nLibc version: glibc-2.35\n\nPython version: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-72-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)\nNvidia driver version: Could not collect\ncuDNN version: Could not collect\nHIP runtime version: 6.3.42134\nMIOpen runtime version: 3.3.0\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 52 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 9554 64-Core Processor\nCPU family: 25\nModel: 17\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3762.9880\nCPU min MHz: 1500.0000\nBogoMIPS: 6190.45\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes 
vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 128 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] mypy==1.9.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] optree==0.14.1\n[pip3] torch==2.7.0a0+git6374332\n[pip3] torchao==0.10.0.dev20250324+rocm6.3\n[pip3] torchdata==0.11.0\n[pip3] torchtune==0.0.0\n[pip3] torchvision==0.22.0a0+956025b\n[pip3] triton==3.2.0\n[conda] No relevant packages", + "transformers_version": "4.46.3", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "2" + ], + "tokenizer_bos_token": [ + "", + "1" + ], + "eot_token_id": 2, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "mistralai/Mixtral-8x7B-v0.1", + "model_name_sanitized": "mistralai__Mixtral-8x7B-v0.1", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 2911443.660103956, + "end_time": 2912453.494265957, + "total_evaluation_time_seconds": "1009.8341620010324" +} \ No newline at end of file diff --git a/eval_results/zephyr-8x7b-dpo-full/__models__zephyr-8x7b-dpo-full__/results_2025-06-03T19-43-19.625938.json b/eval_results/zephyr-8x7b-dpo-full/__models__zephyr-8x7b-dpo-full__/results_2025-06-03T19-43-19.625938.json new file mode 100644 index 0000000000000000000000000000000000000000..d6006ea44d8ddc5235f4763c4a6a5956cfb94806 --- /dev/null +++ b/eval_results/zephyr-8x7b-dpo-full/__models__zephyr-8x7b-dpo-full__/results_2025-06-03T19-43-19.625938.json @@ -0,0 +1,4057 @@ +{ + "results": { + "openllm": { + " ": " ", + "alias": "Open LLM Leaderboard" + }, + "arc_challenge": { + "alias": " - arc_challenge", + "acc,none": 0.6715017064846417, + "acc_stderr,none": 0.013724978465537297, + "acc_norm,none": 0.6911262798634812, + "acc_norm_stderr,none": 0.013501770929344003 + }, + "gsm8k": { + "alias": " - gsm8k", + "exact_match,strict-match": 0.27369219105382864, + "exact_match_stderr,strict-match": 0.012281003490963444, + "exact_match,flexible-extract": 0.6065200909780136, + "exact_match_stderr,flexible-extract": 0.013456315828404597 + }, + "hellaswag": { + "alias": " - hellaswag", + "acc,none": 0.6878111929894444, + "acc_stderr,none": 0.004624393690966898, + "acc_norm,none": 0.8355905198167696, + "acc_norm_stderr,none": 0.0036988923883800903 + }, + "mmlu": { + "acc,none": 0.6989745050562598, + "acc_stderr,none": 0.0036504800073218044, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6505844845908608, + "acc_stderr,none": 0.006566896822760607, 
+ "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5714285714285714, + "acc_stderr,none": 0.04426266681379909 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.8242424242424242, + "acc_stderr,none": 0.02972094300622445 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8627450980392157, + "acc_stderr,none": 0.024152225962801584 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8776371308016878, + "acc_stderr,none": 0.02133174182974679 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8429752066115702, + "acc_stderr,none": 0.03321244842547129 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8240740740740741, + "acc_stderr,none": 0.036809181416738807 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.7975460122699386, + "acc_stderr,none": 0.031570650789119 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7572254335260116, + "acc_stderr,none": 0.023083658586984204 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.4782122905027933, + "acc_stderr,none": 0.016706617522176132 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7813504823151125, + "acc_stderr,none": 0.02347558141786111 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.8209876543209876, + "acc_stderr,none": 0.021330868762127045 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5215123859191656, + "acc_stderr,none": 0.012758410941038923 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8713450292397661, + "acc_stderr,none": 0.02567934272327694 + }, + "mmlu_other": { + "acc,none": 0.7650466688123592, + "acc_stderr,none": 0.007268026183913096, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.71, + "acc_stderr,none": 0.045604802157206845 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7962264150943397, + "acc_stderr,none": 0.02479078450177541 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7167630057803468, + "acc_stderr,none": 0.034355680560478746 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.42, + "acc_stderr,none": 0.04960449637488584 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7533632286995515, + "acc_stderr,none": 0.028930413120910877 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8543689320388349, + "acc_stderr,none": 0.03492606476623791 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9145299145299145, + "acc_stderr,none": 0.018315891685625838 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.8, + "acc_stderr,none": 0.040201512610368445 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.876117496807152, + "acc_stderr,none": 0.011781017100950742 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7679738562091504, + "acc_stderr,none": 0.02417084087934087 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5531914893617021, + "acc_stderr,none": 0.029658235097666907 + }, + "mmlu_professional_medicine": { + 
"alias": " - professional_medicine", + "acc,none": 0.7867647058823529, + "acc_stderr,none": 0.024880971512294264 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5301204819277109, + "acc_stderr,none": 0.03885425420866766 + }, + "mmlu_social_sciences": { + "acc,none": 0.79688007799805, + "acc_stderr,none": 0.007123515080886825, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6578947368421053, + "acc_stderr,none": 0.04462917535336936 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8434343434343434, + "acc_stderr,none": 0.025890520358141454 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.9430051813471503, + "acc_stderr,none": 0.01673108529360755 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.7, + "acc_stderr,none": 0.02323458108842849 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7815126050420168, + "acc_stderr,none": 0.026841514322958924 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8899082568807339, + "acc_stderr,none": 0.01341993901868121 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.8015267175572519, + "acc_stderr,none": 0.03498149385462471 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.761437908496732, + "acc_stderr,none": 0.0172423858287796 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6818181818181818, + "acc_stderr,none": 0.04461272175910507 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7346938775510204, + "acc_stderr,none": 0.028263889943784606 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8507462686567164, + "acc_stderr,none": 0.02519692987482707 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.87, + "acc_stderr,none": 0.03379976689896308 + }, + "mmlu_stem": { + "acc,none": 0.6105296542974944, + "acc_stderr,none": 0.008286657765421425, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.41, + "acc_stderr,none": 0.049431107042371025 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6962962962962963, + "acc_stderr,none": 0.03972552884785136 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.8026315789473685, + "acc_stderr,none": 0.03238981601699397 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8194444444444444, + "acc_stderr,none": 0.032166008088022675 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.54, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.65, + "acc_stderr,none": 0.047937248544110196 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.41, + "acc_stderr,none": 0.049431107042371025 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.49019607843137253, + "acc_stderr,none": 0.04974229460422817 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.83, + "acc_stderr,none": 0.0377525168068637 + }, + "mmlu_conceptual_physics": { + "alias": " - 
conceptual_physics", + "acc,none": 0.6553191489361702, + "acc_stderr,none": 0.03106898596312215 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6137931034482759, + "acc_stderr,none": 0.04057324734419035 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.48412698412698413, + "acc_stderr,none": 0.025738330639412152 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8419354838709677, + "acc_stderr,none": 0.020752831511875274 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.6157635467980296, + "acc_stderr,none": 0.0342239856565755 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.76, + "acc_stderr,none": 0.042923469599092816 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.36666666666666664, + "acc_stderr,none": 0.029381620726465076 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.5099337748344371, + "acc_stderr,none": 0.04081677107248437 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.6064814814814815, + "acc_stderr,none": 0.03331747876370312 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.5535714285714286, + "acc_stderr,none": 0.04718471485219588 + }, + "truthfulqa_gen": { + "alias": " - truthfulqa_gen", + "bleu_max,none": 12.262202796645438, + "bleu_max_stderr,none": 0.5917085117623292, + "bleu_acc,none": 0.4749082007343941, + "bleu_acc_stderr,none": 0.017481446804104003, + "bleu_diff,none": -0.04562064093923802, + "bleu_diff_stderr,none": 0.38638585351723603, + "rouge1_max,none": 32.89480201042699, + "rouge1_max_stderr,none": 0.7407414007140322, + "rouge1_acc,none": 0.4969400244798042, + "rouge1_acc_stderr,none": 0.01750317326096062, + "rouge1_diff,none": -0.7950355097239854, + "rouge1_diff_stderr,none": 0.6037276661763632, + "rouge2_max,none": 20.473793885067455, + "rouge2_max_stderr,none": 0.7365660518119835, + "rouge2_acc,none": 0.41615667074663404, + "rouge2_acc_stderr,none": 0.017255657502903043, + "rouge2_diff,none": -2.146463152691057, + "rouge2_diff_stderr,none": 0.7233024324987636, + "rougeL_max,none": 29.93288436324869, + "rougeL_max_stderr,none": 0.7322833363103851, + "rougeL_acc,none": 0.4847001223990208, + "rougeL_acc_stderr,none": 0.017495304473187902, + "rougeL_diff,none": -1.2536504918200089, + "rougeL_diff_stderr,none": 0.6028062689003527 + }, + "truthfulqa_mc1": { + "alias": " - truthfulqa_mc1", + "acc,none": 0.46878824969400246, + "acc_stderr,none": 0.01746936487457754 + }, + "truthfulqa_mc2": { + "alias": " - truthfulqa_mc2", + "acc,none": 0.6311218795509267, + "acc_stderr,none": 0.015724670826356663 + }, + "winogrande": { + "alias": " - winogrande", + "acc,none": 0.7387529597474349, + "acc_stderr,none": 0.012346914863415308 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.6989745050562598, + "acc_stderr,none": 0.0036504800073218044, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6505844845908608, + "acc_stderr,none": 0.006566896822760607, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7650466688123592, + "acc_stderr,none": 0.007268026183913096, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.79688007799805, + "acc_stderr,none": 0.007123515080886825, + "alias": " - social sciences" + }, + "mmlu_stem": { + 
"acc,none": 0.6105296542974944, + "acc_stderr,none": 0.008286657765421425, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_world_history", + "mmlu_jurisprudence", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_international_law", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_prehistory", + "mmlu_logical_fallacies", + "mmlu_philosophy" + ], + "mmlu_social_sciences": [ + "mmlu_public_relations", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_psychology", + "mmlu_sociology", + "mmlu_professional_psychology", + "mmlu_us_foreign_policy", + "mmlu_security_studies", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_human_sexuality", + "mmlu_high_school_microeconomics", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_nutrition", + "mmlu_human_aging", + "mmlu_management", + "mmlu_college_medicine", + "mmlu_professional_accounting", + "mmlu_miscellaneous", + "mmlu_business_ethics", + "mmlu_professional_medicine", + "mmlu_medical_genetics", + "mmlu_virology", + "mmlu_clinical_knowledge", + "mmlu_global_facts", + "mmlu_marketing" + ], + "mmlu_stem": [ + "mmlu_computer_security", + "mmlu_abstract_algebra", + "mmlu_high_school_computer_science", + "mmlu_machine_learning", + "mmlu_high_school_chemistry", + "mmlu_college_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_statistics", + "mmlu_college_computer_science", + "mmlu_high_school_biology", + "mmlu_high_school_physics", + "mmlu_college_chemistry", + "mmlu_college_mathematics", + "mmlu_college_biology", + "mmlu_high_school_mathematics", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_conceptual_physics" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "openllm": [ + "arc_challenge", + "hellaswag", + "truthfulqa_gen", + "truthfulqa_mc2", + "truthfulqa_mc1", + "mmlu", + "winogrande", + "gsm8k" + ] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "gsm8k": { + "task": "gsm8k", + "tag": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + 
"metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### ", + "\\.$" + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "Question:", + "", + "<|im_end|>" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "strict-match", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + }, + { + "name": "flexible-extract", + "filter": [ + { + "function": "regex", + "group_select": -1, + "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:\n return dataset.map(preprocess_function)\n", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "unsafe_code": false, + "process_results": "def process_results_gen(doc, results):\n completion = results[0]\n true_refs, false_refs = doc[\"correct_answers\"], doc[\"incorrect_answers\"]\n all_refs = true_refs + false_refs\n\n # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.\n\n # # BLEURT\n # bleurt_scores_true = self.bleurt.compute(\n # predictions=[completion] * len(true_refs), references=true_refs\n # )[\"scores\"]\n # bleurt_scores_false = self.bleurt.compute(\n # predictions=[completion] * len(false_refs), references=false_refs\n # )[\"scores\"]\n # bleurt_correct = max(bleurt_scores_true)\n # bleurt_incorrect = max(bleurt_scores_false)\n # bleurt_max = bleurt_correct\n # bleurt_diff = bleurt_correct - bleurt_incorrect\n # bleurt_acc = int(bleurt_correct > bleurt_incorrect)\n\n # BLEU\n bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]\n bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])\n bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])\n bleu_max = bleu_correct\n bleu_diff = bleu_correct - bleu_incorrect\n bleu_acc = int(bleu_correct > bleu_incorrect)\n\n # ROUGE-N\n rouge_scores = [rouge([ref], [completion]) for ref in all_refs]\n # ROUGE-1\n rouge1_scores = [score[\"rouge1\"] for score in rouge_scores]\n rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])\n rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])\n rouge1_max = rouge1_correct\n rouge1_diff = rouge1_correct - rouge1_incorrect\n rouge1_acc = int(rouge1_correct > rouge1_incorrect)\n # ROUGE-2\n rouge2_scores = [score[\"rouge2\"] for score in rouge_scores]\n rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])\n rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])\n rouge2_max = rouge2_correct\n rouge2_diff = rouge2_correct - rouge2_incorrect\n rouge2_acc = int(rouge2_correct > rouge2_incorrect)\n # ROUGE-L\n rougeL_scores = [score[\"rougeLsum\"] for score in rouge_scores]\n rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])\n rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])\n rougeL_max = rougeL_correct\n rougeL_diff = rougeL_correct - rougeL_incorrect\n rougeL_acc = int(rougeL_correct > rougeL_incorrect)\n\n return {\n # \"bleurt_max\": bleurt_max,\n # \"bleurt_acc\": bleurt_acc,\n # \"bleurt_diff\": bleurt_diff,\n \"bleu_max\": bleu_max,\n \"bleu_acc\": bleu_acc,\n \"bleu_diff\": bleu_diff,\n \"rouge1_max\": rouge1_max,\n \"rouge1_acc\": rouge1_acc,\n \"rouge1_diff\": rouge1_diff,\n \"rouge2_max\": rouge2_max,\n \"rouge2_acc\": rouge2_acc,\n \"rouge2_diff\": rouge2_diff,\n \"rougeL_max\": rougeL_max,\n \"rougeL_acc\": rougeL_acc,\n \"rougeL_diff\": rougeL_diff,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + 
"metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "def process_results_mc2(doc, results):\n ll, _ = zip(*results)\n ll = np.array(ll)\n\n # Convert log-likelihoods to probabilities.\n probs = np.exp(ll)\n\n # Normalize probabilities.\n probs_norm = probs / np.sum(probs)\n\n labels = np.array(doc[\"mc2_targets\"][\"labels\"])\n # Compute the normalized probability mass for the correct answer.\n pm_true = np.sum(probs_norm[labels == 1])\n\n return {\"acc\": pm_true}\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + } + }, + "versions": { + "arc_challenge": 1.0, + "gsm8k": 3.0, + "hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + 
"mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "truthfulqa_gen": 3.0, + "truthfulqa_mc1": 2.0, + "truthfulqa_mc2": 3.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 25, + "gsm8k": 5, + "hellaswag": 10, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0, + "winogrande": 5 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "gsm8k": { + "exact_match": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "mmlu": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + 
"rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + 
"mmlu_social_sciences": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "openllm": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "truthfulqa_gen": { + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true + }, + "truthfulqa_mc1": { + "acc": true + }, + "truthfulqa_mc2": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "arc_challenge": { + "original": 1172, + "effective": 1172 + }, + "hellaswag": { + "original": 10042, + "effective": 10042 + }, + "truthfulqa_gen": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc2": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc1": { + "original": 817, + "effective": 817 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + 
"effective": 103 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "winogrande": { + "original": 1267, + "effective": 1267 + }, + "gsm8k": { + "original": 1319, + "effective": 1319 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=/models/zephyr-8x7b-dpo-full/,attn_implementation=flash_attention_2", + "model_num_parameters": 46702792704, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "", + "batch_size": "64", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "8bc4aff", + "date": 1748978736.27075, + "pretty_env_info": "PyTorch version: 2.7.0a0+git6374332\nIs debug build: False\nCUDA used to build PyTorch: N/A\nROCM used to build PyTorch: 6.3.42134-a9a80e791\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 18.0.0git 
(https://github.com/RadeonOpenCompute/llvm-project roc-6.3.4 25012 e5bf7e55c91490b07c49d8960fa7983d864936c4)\nCMake version: version 3.31.6\nLibc version: glibc-2.35\n\nPython version: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-72-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)\nNvidia driver version: Could not collect\ncuDNN version: Could not collect\nHIP runtime version: 6.3.42134\nMIOpen runtime version: 3.3.0\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 52 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 9554 64-Core Processor\nCPU family: 25\nModel: 17\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3762.9880\nCPU min MHz: 1500.0000\nBogoMIPS: 6190.45\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 128 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] mypy==1.9.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] optree==0.14.1\n[pip3] torch==2.7.0a0+git6374332\n[pip3] torchao==0.10.0.dev20250324+rocm6.3\n[pip3] torchdata==0.11.0\n[pip3] torchtune==0.0.0\n[pip3] torchvision==0.22.0a0+956025b\n[pip3] triton==3.2.0\n[conda] No relevant packages", + "transformers_version": "4.46.3", + "lm_eval_version": 
"0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "2" + ], + "tokenizer_eos_token": [ + "", + "2" + ], + "tokenizer_bos_token": [ + "", + "1" + ], + "eot_token_id": 2, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "/models/zephyr-8x7b-dpo-full/", + "model_name_sanitized": "__models__zephyr-8x7b-dpo-full__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", + "chat_template_sha": "66291cf0045c2425a3a667cf3cbb7af2b11f09e025c02f97245323ab79119362", + "start_time": 3247782.952608405, + "end_time": 3248848.127254611, + "total_evaluation_time_seconds": "1065.1746462057345" +} \ No newline at end of file diff --git a/eval_results/zephyr-8x7b-sft-full/__models__zephyr-8x7b-sft-full__/results_2025-06-03T19-25-17.425921.json b/eval_results/zephyr-8x7b-sft-full/__models__zephyr-8x7b-sft-full__/results_2025-06-03T19-25-17.425921.json new file mode 100644 index 0000000000000000000000000000000000000000..bfdb30c00bb1c817b57be154ac2a2a1bf0b900c3 --- /dev/null +++ b/eval_results/zephyr-8x7b-sft-full/__models__zephyr-8x7b-sft-full__/results_2025-06-03T19-25-17.425921.json @@ -0,0 +1,4057 @@ +{ + "results": { + "openllm": { + " ": " ", + "alias": "Open LLM Leaderboard" + }, + "arc_challenge": { + "alias": " - arc_challenge", + "acc,none": 0.6382252559726962, + "acc_stderr,none": 0.014041957945038082, + "acc_norm,none": 0.6638225255972696, + "acc_norm_stderr,none": 0.013804855026205766 + }, + "gsm8k": { + "alias": " - gsm8k", + "exact_match,strict-match": 0.2934040940106141, + "exact_match_stderr,strict-match": 0.012541830815461487, + "exact_match,flexible-extract": 0.6141015921152388, + "exact_match_stderr,flexible-extract": 0.013409077471319187 + }, + "hellaswag": { + "alias": " - hellaswag", + "acc,none": 0.6608245369448317, + "acc_stderr,none": 0.004724619193427587, + "acc_norm,none": 0.8473411670981876, + "acc_norm_stderr,none": 0.003589232889306533 + }, + "mmlu": { + "acc,none": 0.6878649764990742, + "acc_stderr,none": 0.0036670762061670184, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6384697130712008, + "acc_stderr,none": 0.00655297777244263, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5634920634920635, + "acc_stderr,none": 0.04435932892851466 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.8181818181818182, + "acc_stderr,none": 0.030117688929503582 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8578431372549019, + "acc_stderr,none": 0.024509803921568627 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8523206751054853, + "acc_stderr,none": 0.023094329582595694 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.859504132231405, + "acc_stderr,none": 0.031722334260021585 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8148148148148148, + "acc_stderr,none": 
0.03755265865037183 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.7852760736196319, + "acc_stderr,none": 0.03226219377286774 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.791907514450867, + "acc_stderr,none": 0.021855255263421802 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.41899441340782123, + "acc_stderr,none": 0.01650157930686168 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7942122186495176, + "acc_stderr,none": 0.02296133990676424 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.8055555555555556, + "acc_stderr,none": 0.02202136610022021 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5189048239895697, + "acc_stderr,none": 0.012761104871472653 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8654970760233918, + "acc_stderr,none": 0.026168221344662294 + }, + "mmlu_other": { + "acc,none": 0.7605407145156099, + "acc_stderr,none": 0.0073324449114317135, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.72, + "acc_stderr,none": 0.045126085985421276 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7924528301886793, + "acc_stderr,none": 0.02495991802891127 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7109826589595376, + "acc_stderr,none": 0.034564257450869995 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.47, + "acc_stderr,none": 0.050161355804659205 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7488789237668162, + "acc_stderr,none": 0.029105220833224605 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8446601941747572, + "acc_stderr,none": 0.03586594738573974 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9273504273504274, + "acc_stderr,none": 0.01700436856813234 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909284 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8722860791826309, + "acc_stderr,none": 0.011935626313999876 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7647058823529411, + "acc_stderr,none": 0.024288619466046105 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5354609929078015, + "acc_stderr,none": 0.029752389657427054 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7610294117647058, + "acc_stderr,none": 0.025905280644893006 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.536144578313253, + "acc_stderr,none": 0.03882310850890593 + }, + "mmlu_social_sciences": { + "acc,none": 0.7894052648683783, + "acc_stderr,none": 0.007192477786598503, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.5789473684210527, + "acc_stderr,none": 0.046446020912223177 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8585858585858586, + "acc_stderr,none": 0.024825909793343346 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.9326424870466321, + "acc_stderr,none": 0.018088393839078898 + }, + "mmlu_high_school_macroeconomics": { + 
"alias": " - high_school_macroeconomics", + "acc,none": 0.6974358974358974, + "acc_stderr,none": 0.023290888053772725 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7563025210084033, + "acc_stderr,none": 0.027886828078380548 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8825688073394495, + "acc_stderr,none": 0.013802780227377336 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7862595419847328, + "acc_stderr,none": 0.0359546161177469 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.7565359477124183, + "acc_stderr,none": 0.017362473762146634 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6727272727272727, + "acc_stderr,none": 0.04494290866252088 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7224489795918367, + "acc_stderr,none": 0.02866685779027465 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8706467661691543, + "acc_stderr,none": 0.023729830881018512 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.88, + "acc_stderr,none": 0.03265986323710906 + }, + "mmlu_stem": { + "acc,none": 0.5908658420551856, + "acc_stderr,none": 0.008344632804579886, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.4, + "acc_stderr,none": 0.04923659639173309 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.7185185185185186, + "acc_stderr,none": 0.038850042458002526 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.7828947368421053, + "acc_stderr,none": 0.033550453048829226 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8194444444444444, + "acc_stderr,none": 0.032166008088022675 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.55, + "acc_stderr,none": 0.05 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.62, + "acc_stderr,none": 0.04878317312145632 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.4, + "acc_stderr,none": 0.049236596391733084 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4117647058823529, + "acc_stderr,none": 0.048971049527263666 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.8, + "acc_stderr,none": 0.04020151261036846 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6382978723404256, + "acc_stderr,none": 0.03141082197596241 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6068965517241379, + "acc_stderr,none": 0.040703290137070705 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.4417989417989418, + "acc_stderr,none": 0.02557625706125384 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8193548387096774, + "acc_stderr,none": 0.02188617856717254 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.6059113300492611, + "acc_stderr,none": 0.034381579670365446 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.71, + "acc_stderr,none": 0.04560480215720684 + }, + "mmlu_high_school_mathematics": { + 
"alias": " - high_school_mathematics", + "acc,none": 0.35185185185185186, + "acc_stderr,none": 0.02911661760608302 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.5099337748344371, + "acc_stderr,none": 0.04081677107248437 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.5879629629629629, + "acc_stderr,none": 0.03356787758160831 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.5178571428571429, + "acc_stderr,none": 0.047427623612430116 + }, + "truthfulqa_gen": { + "alias": " - truthfulqa_gen", + "bleu_max,none": 18.36640586025778, + "bleu_max_stderr,none": 0.8674304415558707, + "bleu_acc,none": 0.49326805385556916, + "bleu_acc_stderr,none": 0.017501914492655386, + "bleu_diff,none": 0.88237417080354, + "bleu_diff_stderr,none": 0.7249901691483172, + "rouge1_max,none": 39.805324800545975, + "rouge1_max_stderr,none": 0.8813982881064194, + "rouge1_acc,none": 0.5006119951040392, + "rouge1_acc_stderr,none": 0.01750348793889251, + "rouge1_diff,none": -0.45883150173302645, + "rouge1_diff_stderr,none": 0.8335402032639051, + "rouge2_max,none": 25.58002993344627, + "rouge2_max_stderr,none": 0.976818886695092, + "rouge2_acc,none": 0.35495716034271724, + "rouge2_acc_stderr,none": 0.016750862381375898, + "rouge2_diff,none": -2.1154080028169173, + "rouge2_diff_stderr,none": 0.9666599627747504, + "rougeL_max,none": 36.91645064234783, + "rougeL_max_stderr,none": 0.8814329878464763, + "rougeL_acc,none": 0.47980416156670747, + "rougeL_acc_stderr,none": 0.01748921684973705, + "rougeL_diff,none": -0.7260643386778234, + "rougeL_diff_stderr,none": 0.8348369023354345 + }, + "truthfulqa_mc1": { + "alias": " - truthfulqa_mc1", + "acc,none": 0.35495716034271724, + "acc_stderr,none": 0.01675086238137591 + }, + "truthfulqa_mc2": { + "alias": " - truthfulqa_mc2", + "acc,none": 0.5040250807225598, + "acc_stderr,none": 0.014948995713444496 + }, + "winogrande": { + "alias": " - winogrande", + "acc,none": 0.8011049723756906, + "acc_stderr,none": 0.011218629972515319 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.6878649764990742, + "acc_stderr,none": 0.0036670762061670184, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6384697130712008, + "acc_stderr,none": 0.00655297777244263, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7605407145156099, + "acc_stderr,none": 0.0073324449114317135, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.7894052648683783, + "acc_stderr,none": 0.007192477786598503, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.5908658420551856, + "acc_stderr,none": 0.008344632804579886, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_world_history", + "mmlu_jurisprudence", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_international_law", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_prehistory", + "mmlu_logical_fallacies", + "mmlu_philosophy" + ], + "mmlu_social_sciences": [ + "mmlu_public_relations", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_psychology", + "mmlu_sociology", + "mmlu_professional_psychology", + "mmlu_us_foreign_policy", + "mmlu_security_studies", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_human_sexuality", + "mmlu_high_school_microeconomics", + "mmlu_econometrics" + 
], + "mmlu_other": [ + "mmlu_nutrition", + "mmlu_human_aging", + "mmlu_management", + "mmlu_college_medicine", + "mmlu_professional_accounting", + "mmlu_miscellaneous", + "mmlu_business_ethics", + "mmlu_professional_medicine", + "mmlu_medical_genetics", + "mmlu_virology", + "mmlu_clinical_knowledge", + "mmlu_global_facts", + "mmlu_marketing" + ], + "mmlu_stem": [ + "mmlu_computer_security", + "mmlu_abstract_algebra", + "mmlu_high_school_computer_science", + "mmlu_machine_learning", + "mmlu_high_school_chemistry", + "mmlu_college_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_statistics", + "mmlu_college_computer_science", + "mmlu_high_school_biology", + "mmlu_high_school_physics", + "mmlu_college_chemistry", + "mmlu_college_mathematics", + "mmlu_college_biology", + "mmlu_high_school_mathematics", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_conceptual_physics" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "openllm": [ + "arc_challenge", + "hellaswag", + "truthfulqa_gen", + "truthfulqa_mc2", + "truthfulqa_mc1", + "mmlu", + "winogrande", + "gsm8k" + ] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "gsm8k": { + "task": "gsm8k", + "tag": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### ", + "\\.$" + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "Question:", + "", + "<|im_end|>" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "strict-match", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + }, + { + "name": "flexible-extract", + "filter": [ + { + "function": "regex", + "group_select": -1, + "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + 
"attn_implementation": "flash_attention_2" + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:\n return dataset.map(preprocess_function)\n", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "unsafe_code": false, + "process_results": "def process_results_gen(doc, results):\n completion = results[0]\n true_refs, false_refs = doc[\"correct_answers\"], doc[\"incorrect_answers\"]\n all_refs = true_refs + false_refs\n\n # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.\n\n # # BLEURT\n # bleurt_scores_true = self.bleurt.compute(\n # predictions=[completion] * len(true_refs), references=true_refs\n # )[\"scores\"]\n # bleurt_scores_false = self.bleurt.compute(\n # predictions=[completion] * len(false_refs), references=false_refs\n # )[\"scores\"]\n # bleurt_correct = max(bleurt_scores_true)\n # bleurt_incorrect = max(bleurt_scores_false)\n # bleurt_max = bleurt_correct\n # bleurt_diff = bleurt_correct - bleurt_incorrect\n # bleurt_acc = int(bleurt_correct > bleurt_incorrect)\n\n # BLEU\n bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]\n bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])\n bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])\n bleu_max = bleu_correct\n bleu_diff = bleu_correct - bleu_incorrect\n bleu_acc = int(bleu_correct > bleu_incorrect)\n\n # ROUGE-N\n rouge_scores = [rouge([ref], [completion]) for ref in all_refs]\n # ROUGE-1\n rouge1_scores = [score[\"rouge1\"] for score in rouge_scores]\n rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])\n rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])\n rouge1_max = rouge1_correct\n rouge1_diff = rouge1_correct - rouge1_incorrect\n rouge1_acc = int(rouge1_correct > rouge1_incorrect)\n # ROUGE-2\n rouge2_scores = [score[\"rouge2\"] for score in rouge_scores]\n rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])\n rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])\n rouge2_max = rouge2_correct\n rouge2_diff = rouge2_correct - rouge2_incorrect\n rouge2_acc = int(rouge2_correct > rouge2_incorrect)\n # ROUGE-L\n rougeL_scores = [score[\"rougeLsum\"] for score in rouge_scores]\n rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])\n rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])\n rougeL_max = rougeL_correct\n rougeL_diff = rougeL_correct - rougeL_incorrect\n rougeL_acc = int(rougeL_correct > rougeL_incorrect)\n\n return {\n # \"bleurt_max\": bleurt_max,\n # \"bleurt_acc\": bleurt_acc,\n # \"bleurt_diff\": bleurt_diff,\n \"bleu_max\": bleu_max,\n \"bleu_acc\": bleu_acc,\n \"bleu_diff\": bleu_diff,\n \"rouge1_max\": rouge1_max,\n \"rouge1_acc\": rouge1_acc,\n \"rouge1_diff\": rouge1_diff,\n \"rouge2_max\": rouge2_max,\n \"rouge2_acc\": rouge2_acc,\n \"rouge2_diff\": rouge2_diff,\n \"rougeL_max\": rougeL_max,\n \"rougeL_acc\": rougeL_acc,\n \"rougeL_diff\": rougeL_diff,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + 
"metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "def process_results_mc2(doc, results):\n ll, _ = zip(*results)\n ll = np.array(ll)\n\n # Convert log-likelihoods to probabilities.\n probs = np.exp(ll)\n\n # Normalize probabilities.\n probs_norm = probs / np.sum(probs)\n\n labels = np.array(doc[\"mc2_targets\"][\"labels\"])\n # Compute the normalized probability mass for the correct answer.\n pm_true = np.sum(probs_norm[labels == 1])\n\n return {\"acc\": pm_true}\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + } + }, + "versions": { + "arc_challenge": 1.0, + "gsm8k": 3.0, + "hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + 
"mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "truthfulqa_gen": 3.0, + "truthfulqa_mc1": 2.0, + "truthfulqa_mc2": 3.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 25, + "gsm8k": 5, + "hellaswag": 10, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0, + "winogrande": 5 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "gsm8k": { + "exact_match": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "mmlu": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + 
"rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + 
"mmlu_social_sciences": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "openllm": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "truthfulqa_gen": { + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true + }, + "truthfulqa_mc1": { + "acc": true + }, + "truthfulqa_mc2": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "arc_challenge": { + "original": 1172, + "effective": 1172 + }, + "hellaswag": { + "original": 10042, + "effective": 10042 + }, + "truthfulqa_gen": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc2": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc1": { + "original": 817, + "effective": 817 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + 
"effective": 103 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "winogrande": { + "original": 1267, + "effective": 1267 + }, + "gsm8k": { + "original": 1319, + "effective": 1319 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=/models/zephyr-8x7b-sft-full/,attn_implementation=flash_attention_2", + "model_num_parameters": 46702792704, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "", + "batch_size": "64", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "8bc4aff", + "date": 1748977639.4302094, + "pretty_env_info": "PyTorch version: 2.7.0a0+git6374332\nIs debug build: False\nCUDA used to build PyTorch: N/A\nROCM used to build PyTorch: 6.3.42134-a9a80e791\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 18.0.0git 
(https://github.com/RadeonOpenCompute/llvm-project roc-6.3.4 25012 e5bf7e55c91490b07c49d8960fa7983d864936c4)\nCMake version: version 3.31.6\nLibc version: glibc-2.35\n\nPython version: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-72-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)\nNvidia driver version: Could not collect\ncuDNN version: Could not collect\nHIP runtime version: 6.3.42134\nMIOpen runtime version: 3.3.0\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 52 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 9554 64-Core Processor\nCPU family: 25\nModel: 17\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3762.9880\nCPU min MHz: 1500.0000\nBogoMIPS: 6190.45\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 128 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] mypy==1.9.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] optree==0.14.1\n[pip3] torch==2.7.0a0+git6374332\n[pip3] torchao==0.10.0.dev20250324+rocm6.3\n[pip3] torchdata==0.11.0\n[pip3] torchtune==0.0.0\n[pip3] torchvision==0.22.0a0+956025b\n[pip3] triton==3.2.0\n[conda] No relevant packages", + "transformers_version": "4.46.3", + "lm_eval_version": 
"0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "2" + ], + "tokenizer_eos_token": [ + "", + "2" + ], + "tokenizer_bos_token": [ + "", + "1" + ], + "eot_token_id": 2, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "/models/zephyr-8x7b-sft-full/", + "model_name_sanitized": "__models__zephyr-8x7b-sft-full__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": true, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", + "chat_template_sha": "66291cf0045c2425a3a667cf3cbb7af2b11f09e025c02f97245323ab79119362", + "start_time": 3246686.116936262, + "end_time": 3247765.927156867, + "total_evaluation_time_seconds": "1079.8102206047624" +} \ No newline at end of file diff --git a/old_eval_results/mistralai__Mixtral-8x7B-v0.1/results_2025-05-30T22-16-44.992858.json b/old_eval_results/mistralai__Mixtral-8x7B-v0.1/results_2025-05-30T22-16-44.992858.json new file mode 100644 index 0000000000000000000000000000000000000000..aa64373f38b744db0942310f4537fc76e550d7b4 --- /dev/null +++ b/old_eval_results/mistralai__Mixtral-8x7B-v0.1/results_2025-05-30T22-16-44.992858.json @@ -0,0 +1,4057 @@ +{ + "results": { + "openllm": { + " ": " ", + "alias": "Open LLM Leaderboard" + }, + "arc_challenge": { + "alias": " - arc_challenge", + "acc,none": 0.6373720136518771, + "acc_stderr,none": 0.014049106564955002, + "acc_norm,none": 0.6663822525597269, + "acc_norm_stderr,none": 0.01377868705417653 + }, + "gsm8k": { + "alias": " - gsm8k", + "exact_match,strict-match": 0.5905989385898408, + "exact_match_stderr,strict-match": 0.013544504071244514, + "exact_match,flexible-extract": 0.5943896891584534, + "exact_match_stderr,flexible-extract": 0.013524848894462115 + }, + "hellaswag": { + "alias": " - hellaswag", + "acc,none": 0.6708822943636725, + "acc_stderr,none": 0.004689324696186881, + "acc_norm,none": 0.861979685321649, + "acc_norm_stderr,none": 0.003442163843362882 + }, + "mmlu": { + "acc,none": 0.7053126335279875, + "acc_stderr,none": 0.003596517880218624, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6486716259298618, + "acc_stderr,none": 0.006479599568839706, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5714285714285714, + "acc_stderr,none": 0.0442626668137991 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.8242424242424242, + "acc_stderr,none": 0.02972094300622445 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8627450980392157, + "acc_stderr,none": 0.024152225962801584 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8818565400843882, + "acc_stderr,none": 0.021011052659878463 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8677685950413223, + "acc_stderr,none": 0.030922788320445795 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8333333333333334, + "acc_stderr,none": 0.03602814176392645 + }, + "mmlu_logical_fallacies": { + "alias": " - 
logical_fallacies", + "acc,none": 0.7730061349693251, + "acc_stderr,none": 0.03291099578615771 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7976878612716763, + "acc_stderr,none": 0.021628077380196124 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.40558659217877097, + "acc_stderr,none": 0.016421670506339168 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7845659163987139, + "acc_stderr,none": 0.023350225475471442 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.8487654320987654, + "acc_stderr,none": 0.019935086092149876 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5417209908735332, + "acc_stderr,none": 0.012725701656953642 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8713450292397661, + "acc_stderr,none": 0.02567934272327694 + }, + "mmlu_other": { + "acc,none": 0.7740585774058577, + "acc_stderr,none": 0.007133218182046057, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.73, + "acc_stderr,none": 0.044619604333847394 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7886792452830189, + "acc_stderr,none": 0.025125766484827845 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7283236994219653, + "acc_stderr,none": 0.03391750322321659 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.46, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7847533632286996, + "acc_stderr,none": 0.027584066602208274 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8737864077669902, + "acc_stderr,none": 0.03288180278808629 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9102564102564102, + "acc_stderr,none": 0.018724301741941635 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.78, + "acc_stderr,none": 0.04163331998932261 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8786717752234994, + "acc_stderr,none": 0.011675913883906723 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.8366013071895425, + "acc_stderr,none": 0.021170623011213512 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5319148936170213, + "acc_stderr,none": 0.029766675075873866 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7977941176470589, + "acc_stderr,none": 0.024398192986654924 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5060240963855421, + "acc_stderr,none": 0.03892212195333045 + }, + "mmlu_social_sciences": { + "acc,none": 0.8121546961325967, + "acc_stderr,none": 0.006919714343596674, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6578947368421053, + "acc_stderr,none": 0.04462917535336937 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8636363636363636, + "acc_stderr,none": 0.024450155973189835 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.9378238341968912, + "acc_stderr,none": 0.017426974154240524 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.7128205128205128, 
+ "acc_stderr,none": 0.022939925418530616 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7941176470588235, + "acc_stderr,none": 0.02626502460827588 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8825688073394495, + "acc_stderr,none": 0.01380278022737732 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.816793893129771, + "acc_stderr,none": 0.03392770926494733 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.7777777777777778, + "acc_stderr,none": 0.016819028375736386 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.7, + "acc_stderr,none": 0.04389311454644286 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7836734693877551, + "acc_stderr,none": 0.026358916334904045 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8905472636815921, + "acc_stderr,none": 0.022076326101824608 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.93, + "acc_stderr,none": 0.0256432399976243 + }, + "mmlu_stem": { + "acc,none": 0.6178242943228671, + "acc_stderr,none": 0.008249947196050048, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.38, + "acc_stderr,none": 0.04878317312145633 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6814814814814815, + "acc_stderr,none": 0.04024778401977109 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.8223684210526315, + "acc_stderr,none": 0.031103182383123384 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8541666666666666, + "acc_stderr,none": 0.029514245964291762 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.54, + "acc_stderr,none": 0.05009082659620333 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.58, + "acc_stderr,none": 0.049604496374885836 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.48, + "acc_stderr,none": 0.050211673156867795 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4803921568627451, + "acc_stderr,none": 0.04971358884367406 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.81, + "acc_stderr,none": 0.03942772444036623 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6680851063829787, + "acc_stderr,none": 0.030783736757745647 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6689655172413793, + "acc_stderr,none": 0.039215453124671215 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.49206349206349204, + "acc_stderr,none": 0.02574806587167329 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8419354838709677, + "acc_stderr,none": 0.020752831511875267 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.6403940886699507, + "acc_stderr,none": 0.03376458246509568 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.72, + "acc_stderr,none": 0.045126085985421276 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.37777777777777777, 
+ "acc_stderr,none": 0.029560707392465715 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.46357615894039733, + "acc_stderr,none": 0.04071636065944217 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.6435185185185185, + "acc_stderr,none": 0.032664783315272714 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.5892857142857143, + "acc_stderr,none": 0.04669510663875191 + }, + "truthfulqa_gen": { + "alias": " - truthfulqa_gen", + "bleu_max,none": 27.911232508498426, + "bleu_max_stderr,none": 0.8106547087099264, + "bleu_acc,none": 0.44063647490820074, + "bleu_acc_stderr,none": 0.017379697555437446, + "bleu_diff,none": -2.7149338401323995, + "bleu_diff_stderr,none": 0.8105309556711907, + "rouge1_max,none": 53.53052750908425, + "rouge1_max_stderr,none": 0.8705222327844533, + "rouge1_acc,none": 0.43818849449204406, + "rouge1_acc_stderr,none": 0.017369236164404434, + "rouge1_diff,none": -3.8221294639026686, + "rouge1_diff_stderr,none": 0.9512456166796464, + "rouge2_max,none": 37.92580985775092, + "rouge2_max_stderr,none": 1.021406726801759, + "rouge2_acc,none": 0.3708690330477356, + "rouge2_acc_stderr,none": 0.016909693580248828, + "rouge2_diff,none": -5.092332229167073, + "rouge2_diff_stderr,none": 1.1070311425609114, + "rougeL_max,none": 50.93369782630966, + "rougeL_max_stderr,none": 0.8793511551021356, + "rougeL_acc,none": 0.42962056303549573, + "rougeL_acc_stderr,none": 0.0173292345804091, + "rougeL_diff,none": -3.9618623263413393, + "rougeL_diff_stderr,none": 0.9565565804334322 + }, + "truthfulqa_mc1": { + "alias": " - truthfulqa_mc1", + "acc,none": 0.34394124847001223, + "acc_stderr,none": 0.016629087514276775 + }, + "truthfulqa_mc2": { + "alias": " - truthfulqa_mc2", + "acc,none": 0.486118184178896, + "acc_stderr,none": 0.014546300990229502 + }, + "winogrande": { + "alias": " - winogrande", + "acc,none": 0.819258089976322, + "acc_stderr,none": 0.010814911009613971 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.7053126335279875, + "acc_stderr,none": 0.003596517880218624, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6486716259298618, + "acc_stderr,none": 0.006479599568839706, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7740585774058577, + "acc_stderr,none": 0.007133218182046057, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.8121546961325967, + "acc_stderr,none": 0.006919714343596674, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.6178242943228671, + "acc_stderr,none": 0.008249947196050048, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_world_history", + "mmlu_jurisprudence", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_international_law", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_prehistory", + "mmlu_logical_fallacies", + "mmlu_philosophy" + ], + "mmlu_social_sciences": [ + "mmlu_public_relations", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_psychology", + "mmlu_sociology", + "mmlu_professional_psychology", + "mmlu_us_foreign_policy", + "mmlu_security_studies", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_human_sexuality", + "mmlu_high_school_microeconomics", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_nutrition", + "mmlu_human_aging", + 
"mmlu_management", + "mmlu_college_medicine", + "mmlu_professional_accounting", + "mmlu_miscellaneous", + "mmlu_business_ethics", + "mmlu_professional_medicine", + "mmlu_medical_genetics", + "mmlu_virology", + "mmlu_clinical_knowledge", + "mmlu_global_facts", + "mmlu_marketing" + ], + "mmlu_stem": [ + "mmlu_computer_security", + "mmlu_abstract_algebra", + "mmlu_high_school_computer_science", + "mmlu_machine_learning", + "mmlu_high_school_chemistry", + "mmlu_college_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_statistics", + "mmlu_college_computer_science", + "mmlu_high_school_biology", + "mmlu_high_school_physics", + "mmlu_college_chemistry", + "mmlu_college_mathematics", + "mmlu_college_biology", + "mmlu_high_school_mathematics", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_conceptual_physics" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "openllm": [ + "arc_challenge", + "hellaswag", + "truthfulqa_gen", + "truthfulqa_mc2", + "truthfulqa_mc1", + "mmlu", + "winogrande", + "gsm8k" + ] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "gsm8k": { + "task": "gsm8k", + "tag": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### ", + "\\.$" + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "Question:", + "", + "<|im_end|>" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "strict-match", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + }, + { + "name": "flexible-extract", + "filter": [ + { + "function": "regex", + "group_select": -1, + "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 3.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "hellaswag": { + "task": 
"hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:\n return dataset.map(preprocess_function)\n", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "unsafe_code": false, + "process_results": "def process_results_gen(doc, results):\n completion = results[0]\n true_refs, false_refs = doc[\"correct_answers\"], doc[\"incorrect_answers\"]\n all_refs = true_refs + false_refs\n\n # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.\n\n # # BLEURT\n # bleurt_scores_true = self.bleurt.compute(\n # predictions=[completion] * len(true_refs), references=true_refs\n # )[\"scores\"]\n # bleurt_scores_false = self.bleurt.compute(\n # predictions=[completion] * len(false_refs), references=false_refs\n # )[\"scores\"]\n # bleurt_correct = max(bleurt_scores_true)\n # bleurt_incorrect = max(bleurt_scores_false)\n # bleurt_max = bleurt_correct\n # bleurt_diff = bleurt_correct - bleurt_incorrect\n # bleurt_acc = int(bleurt_correct > bleurt_incorrect)\n\n # BLEU\n bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]\n bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])\n bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])\n bleu_max = bleu_correct\n bleu_diff = bleu_correct - bleu_incorrect\n bleu_acc = int(bleu_correct > bleu_incorrect)\n\n # ROUGE-N\n rouge_scores = [rouge([ref], [completion]) for ref in all_refs]\n # ROUGE-1\n rouge1_scores = [score[\"rouge1\"] for score in rouge_scores]\n rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])\n rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])\n rouge1_max = rouge1_correct\n rouge1_diff = rouge1_correct - rouge1_incorrect\n rouge1_acc = int(rouge1_correct > rouge1_incorrect)\n # ROUGE-2\n rouge2_scores = [score[\"rouge2\"] for score in rouge_scores]\n rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])\n rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])\n rouge2_max = rouge2_correct\n rouge2_diff = rouge2_correct - rouge2_incorrect\n rouge2_acc = int(rouge2_correct > rouge2_incorrect)\n # ROUGE-L\n rougeL_scores = [score[\"rougeLsum\"] for score in rouge_scores]\n rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])\n rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])\n rougeL_max = rougeL_correct\n rougeL_diff = rougeL_correct - rougeL_incorrect\n rougeL_acc = int(rougeL_correct > rougeL_incorrect)\n\n return {\n # \"bleurt_max\": bleurt_max,\n # \"bleurt_acc\": bleurt_acc,\n # \"bleurt_diff\": bleurt_diff,\n \"bleu_max\": bleu_max,\n \"bleu_acc\": bleu_acc,\n \"bleu_diff\": bleu_diff,\n \"rouge1_max\": rouge1_max,\n \"rouge1_acc\": rouge1_acc,\n \"rouge1_diff\": rouge1_diff,\n \"rouge2_max\": rouge2_max,\n \"rouge2_acc\": rouge2_acc,\n \"rouge2_diff\": rouge2_diff,\n \"rougeL_max\": rougeL_max,\n \"rougeL_acc\": rougeL_acc,\n \"rougeL_diff\": rougeL_diff,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + 
"metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "def process_results_mc2(doc, results):\n ll, _ = zip(*results)\n ll = np.array(ll)\n\n # Convert log-likelihoods to probabilities.\n probs = np.exp(ll)\n\n # Normalize probabilities.\n probs_norm = probs / np.sum(probs)\n\n labels = np.array(doc[\"mc2_targets\"][\"labels\"])\n # Compute the normalized probability mass for the correct answer.\n pm_true = np.sum(probs_norm[labels == 1])\n\n return {\"acc\": pm_true}\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "mistralai/Mixtral-8x7B-v0.1", + "attn_implementation": "flash_attention_2" + } + } + }, + "versions": { + "arc_challenge": 1.0, + "gsm8k": 3.0, + "hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + 
"mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "truthfulqa_gen": 3.0, + "truthfulqa_mc1": 2.0, + "truthfulqa_mc2": 3.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 25, + "gsm8k": 5, + "hellaswag": 10, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0, + "winogrande": 5 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "gsm8k": { + "exact_match": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "mmlu": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + 
"rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + 
"mmlu_social_sciences": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "openllm": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "truthfulqa_gen": { + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true + }, + "truthfulqa_mc1": { + "acc": true + }, + "truthfulqa_mc2": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "arc_challenge": { + "original": 1172, + "effective": 1172 + }, + "hellaswag": { + "original": 10042, + "effective": 10042 + }, + "truthfulqa_gen": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc2": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc1": { + "original": 817, + "effective": 817 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + 
"effective": 103 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "winogrande": { + "original": 1267, + "effective": 1267 + }, + "gsm8k": { + "original": 1319, + "effective": 1319 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,attn_implementation=flash_attention_2", + "model_num_parameters": 46702792704, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "ffe1a706bacbd5abddc5ff99432ee38f7e0662fb", + "batch_size": "64", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "8bc4aff", + "date": 1748642396.9835355, + "pretty_env_info": "PyTorch version: 2.7.0a0+git6374332\nIs debug build: False\nCUDA used to build PyTorch: N/A\nROCM used to build PyTorch: 6.3.42134-a9a80e791\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang 
version: 18.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.3.4 25012 e5bf7e55c91490b07c49d8960fa7983d864936c4)\nCMake version: version 3.31.6\nLibc version: glibc-2.35\n\nPython version: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-72-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)\nNvidia driver version: Could not collect\ncuDNN version: Could not collect\nHIP runtime version: 6.3.42134\nMIOpen runtime version: 3.3.0\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 52 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 9554 64-Core Processor\nCPU family: 25\nModel: 17\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3762.9880\nCPU min MHz: 1500.0000\nBogoMIPS: 6190.45\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 128 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] mypy==1.9.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] optree==0.14.1\n[pip3] torch==2.7.0a0+git6374332\n[pip3] torchao==0.10.0.dev20250324+rocm6.3\n[pip3] torchdata==0.11.0\n[pip3] torchtune==0.0.0\n[pip3] torchvision==0.22.0a0+956025b\n[pip3] triton==3.2.0\n[conda] No relevant packages", + "transformers_version": "4.46.3", + 
"lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "0" + ], + "tokenizer_eos_token": [ + "", + "2" + ], + "tokenizer_bos_token": [ + "", + "1" + ], + "eot_token_id": 2, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "mistralai/Mixtral-8x7B-v0.1", + "model_name_sanitized": "mistralai__Mixtral-8x7B-v0.1", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 2911443.660103956, + "end_time": 2912453.494265957, + "total_evaluation_time_seconds": "1009.8341620010324" +} \ No newline at end of file diff --git a/old_eval_results/zephyr-8x7b-dpo-full/__models__zephyr-8x7b-dpo-full__/results_2025-05-30T22-59-20.947894.json b/old_eval_results/zephyr-8x7b-dpo-full/__models__zephyr-8x7b-dpo-full__/results_2025-05-30T22-59-20.947894.json new file mode 100644 index 0000000000000000000000000000000000000000..303bb0e471e117a8288c7408014f572d5aae0293 --- /dev/null +++ b/old_eval_results/zephyr-8x7b-dpo-full/__models__zephyr-8x7b-dpo-full__/results_2025-05-30T22-59-20.947894.json @@ -0,0 +1,4057 @@ +{ + "results": { + "openllm": { + " ": " ", + "alias": "Open LLM Leaderboard" + }, + "arc_challenge": { + "alias": " - arc_challenge", + "acc,none": 0.6604095563139932, + "acc_stderr,none": 0.013839039762820166, + "acc_norm,none": 0.6928327645051194, + "acc_norm_stderr,none": 0.013481034054980945 + }, + "gsm8k": { + "alias": " - gsm8k", + "exact_match,strict-match": 0.5860500379075056, + "exact_match_stderr,strict-match": 0.013566991960151781, + "exact_match,flexible-extract": 0.6527672479150872, + "exact_match_stderr,flexible-extract": 0.013113898382146874 + }, + "hellaswag": { + "alias": " - hellaswag", + "acc,none": 0.6917944632543318, + "acc_stderr,none": 0.004608082815535504, + "acc_norm,none": 0.8777136028679546, + "acc_norm_stderr,none": 0.0032694673590543183 + }, + "mmlu": { + "acc,none": 0.7063096425010682, + "acc_stderr,none": 0.0036319185670430237, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6624867162592987, + "acc_stderr,none": 0.006521778355281863, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5476190476190477, + "acc_stderr,none": 0.044518079590553275 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.806060606060606, + "acc_stderr,none": 0.030874145136562094 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8774509803921569, + "acc_stderr,none": 0.023015389732458265 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8860759493670886, + "acc_stderr,none": 0.020681745135884555 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8677685950413223, + "acc_stderr,none": 0.030922788320445795 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8240740740740741, + "acc_stderr,none": 0.036809181416738807 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.754601226993865, + "acc_stderr,none": 0.03380939813943354 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7803468208092486, + "acc_stderr,none": 0.022289638852617897 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.4770949720670391, + "acc_stderr,none": 0.016704945740326185 + }, + 
"mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7942122186495176, + "acc_stderr,none": 0.022961339906764244 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.8333333333333334, + "acc_stderr,none": 0.020736358408060006 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.5508474576271186, + "acc_stderr,none": 0.012704030518851482 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8771929824561403, + "acc_stderr,none": 0.025172984350155726 + }, + "mmlu_other": { + "acc,none": 0.7692307692307693, + "acc_stderr,none": 0.007252851524789787, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.67, + "acc_stderr,none": 0.04725815626252609 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.8037735849056604, + "acc_stderr,none": 0.024442388131100844 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7398843930635838, + "acc_stderr,none": 0.03345036916788991 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.5, + "acc_stderr,none": 0.050251890762960605 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7713004484304933, + "acc_stderr,none": 0.028188240046929203 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8446601941747572, + "acc_stderr,none": 0.03586594738573973 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9188034188034188, + "acc_stderr,none": 0.01789378490401854 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.76, + "acc_stderr,none": 0.04292346959909284 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.879948914431673, + "acc_stderr,none": 0.011622736692041249 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7712418300653595, + "acc_stderr,none": 0.02405102973991226 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5460992907801419, + "acc_stderr,none": 0.029700453247291474 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7757352941176471, + "acc_stderr,none": 0.025336848563332372 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5542168674698795, + "acc_stderr,none": 0.03869543323472101 + }, + "mmlu_social_sciences": { + "acc,none": 0.8040298992525187, + "acc_stderr,none": 0.007004484845344444, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6403508771929824, + "acc_stderr,none": 0.04514496132873633 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8434343434343434, + "acc_stderr,none": 0.025890520358141454 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.9533678756476683, + "acc_stderr,none": 0.015216761819262575 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6794871794871795, + "acc_stderr,none": 0.023661296393964273 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7983193277310925, + "acc_stderr,none": 0.026064313406304534 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8935779816513761, + "acc_stderr,none": 0.013221554674594372 + }, + 
"mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7938931297709924, + "acc_stderr,none": 0.03547771004159464 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.7696078431372549, + "acc_stderr,none": 0.01703522925803404 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6818181818181818, + "acc_stderr,none": 0.04461272175910507 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7918367346938775, + "acc_stderr,none": 0.025991117672813296 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8756218905472637, + "acc_stderr,none": 0.023335401790166327 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.88, + "acc_stderr,none": 0.032659863237109066 + }, + "mmlu_stem": { + "acc,none": 0.6143355534411672, + "acc_stderr,none": 0.008313907359872409, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.45, + "acc_stderr,none": 0.049999999999999996 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6888888888888889, + "acc_stderr,none": 0.039992628766177214 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.7960526315789473, + "acc_stderr,none": 0.03279000406310052 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8472222222222222, + "acc_stderr,none": 0.03008574324856568 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.58, + "acc_stderr,none": 0.049604496374885836 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.62, + "acc_stderr,none": 0.04878317312145632 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.45, + "acc_stderr,none": 0.04999999999999999 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4803921568627451, + "acc_stderr,none": 0.04971358884367406 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.81, + "acc_stderr,none": 0.039427724440366234 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6468085106382979, + "acc_stderr,none": 0.031245325202761926 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5862068965517241, + "acc_stderr,none": 0.04104269211806232 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.5132275132275133, + "acc_stderr,none": 0.025742297289575142 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8290322580645161, + "acc_stderr,none": 0.02141724293632156 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.6206896551724138, + "acc_stderr,none": 0.034139638059062345 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.71, + "acc_stderr,none": 0.045604802157206845 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.37407407407407406, + "acc_stderr,none": 0.02950286112895529 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.4503311258278146, + "acc_stderr,none": 0.04062290018683775 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.6203703703703703, + "acc_stderr,none": 0.03309682581119035 + }, 
+ "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.6517857142857143, + "acc_stderr,none": 0.045218299028335865 + }, + "truthfulqa_gen": { + "alias": " - truthfulqa_gen", + "bleu_max,none": 22.671661247272112, + "bleu_max_stderr,none": 0.7411961145801333, + "bleu_acc,none": 0.46266829865361075, + "bleu_acc_stderr,none": 0.017454645150970588, + "bleu_diff,none": -0.8512637475392046, + "bleu_diff_stderr,none": 0.6499471609876084, + "rouge1_max,none": 48.37854124696474, + "rouge1_max_stderr,none": 0.8403971643701553, + "rouge1_acc,none": 0.4675642594859241, + "rouge1_acc_stderr,none": 0.01746663214957761, + "rouge1_diff,none": -1.6920141660783337, + "rouge1_diff_stderr,none": 0.7798680674004778, + "rouge2_max,none": 32.32195271891199, + "rouge2_max_stderr,none": 0.9581726364088947, + "rouge2_acc,none": 0.386780905752754, + "rouge2_acc_stderr,none": 0.017048857010515103, + "rouge2_diff,none": -2.86907707894336, + "rouge2_diff_stderr,none": 0.9335707656440132, + "rougeL_max,none": 45.22765998420622, + "rougeL_max_stderr,none": 0.8402364245836251, + "rougeL_acc,none": 0.4565483476132191, + "rougeL_acc_stderr,none": 0.017437280953183695, + "rougeL_diff,none": -1.992769535802551, + "rougeL_diff_stderr,none": 0.7839032639078215 + }, + "truthfulqa_mc1": { + "alias": " - truthfulqa_mc1", + "acc,none": 0.408812729498164, + "acc_stderr,none": 0.017209952151641728 + }, + "truthfulqa_mc2": { + "alias": " - truthfulqa_mc2", + "acc,none": 0.5503356499756785, + "acc_stderr,none": 0.015488282798370166 + }, + "winogrande": { + "alias": " - winogrande", + "acc,none": 0.8255722178374112, + "acc_stderr,none": 0.010665187902498444 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.7063096425010682, + "acc_stderr,none": 0.0036319185670430237, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6624867162592987, + "acc_stderr,none": 0.006521778355281863, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7692307692307693, + "acc_stderr,none": 0.007252851524789787, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.8040298992525187, + "acc_stderr,none": 0.007004484845344444, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.6143355534411672, + "acc_stderr,none": 0.008313907359872409, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_world_history", + "mmlu_jurisprudence", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_international_law", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_prehistory", + "mmlu_logical_fallacies", + "mmlu_philosophy" + ], + "mmlu_social_sciences": [ + "mmlu_public_relations", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_psychology", + "mmlu_sociology", + "mmlu_professional_psychology", + "mmlu_us_foreign_policy", + "mmlu_security_studies", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_human_sexuality", + "mmlu_high_school_microeconomics", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_nutrition", + "mmlu_human_aging", + "mmlu_management", + "mmlu_college_medicine", + "mmlu_professional_accounting", + "mmlu_miscellaneous", + "mmlu_business_ethics", + "mmlu_professional_medicine", + "mmlu_medical_genetics", + "mmlu_virology", + "mmlu_clinical_knowledge", + "mmlu_global_facts", + "mmlu_marketing" + ], + "mmlu_stem": [ + "mmlu_computer_security", + "mmlu_abstract_algebra", + 
"mmlu_high_school_computer_science", + "mmlu_machine_learning", + "mmlu_high_school_chemistry", + "mmlu_college_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_statistics", + "mmlu_college_computer_science", + "mmlu_high_school_biology", + "mmlu_high_school_physics", + "mmlu_college_chemistry", + "mmlu_college_mathematics", + "mmlu_college_biology", + "mmlu_high_school_mathematics", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_conceptual_physics" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "openllm": [ + "arc_challenge", + "hellaswag", + "truthfulqa_gen", + "truthfulqa_mc2", + "truthfulqa_mc1", + "mmlu", + "winogrande", + "gsm8k" + ] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "gsm8k": { + "task": "gsm8k", + "tag": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### ", + "\\.$" + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "Question:", + "", + "<|im_end|>" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "strict-match", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + }, + { + "name": "flexible-extract", + "filter": [ + { + "function": "regex", + "group_select": -1, + "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + 
\" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:\n return dataset.map(preprocess_function)\n", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "unsafe_code": false, + "process_results": "def process_results_gen(doc, results):\n completion = results[0]\n true_refs, false_refs = doc[\"correct_answers\"], doc[\"incorrect_answers\"]\n all_refs = true_refs + false_refs\n\n # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.\n\n # # BLEURT\n # bleurt_scores_true = self.bleurt.compute(\n # predictions=[completion] * len(true_refs), references=true_refs\n # )[\"scores\"]\n # bleurt_scores_false = self.bleurt.compute(\n # predictions=[completion] * len(false_refs), references=false_refs\n # )[\"scores\"]\n # bleurt_correct = max(bleurt_scores_true)\n # bleurt_incorrect = max(bleurt_scores_false)\n # bleurt_max = bleurt_correct\n # bleurt_diff = bleurt_correct - bleurt_incorrect\n # bleurt_acc = int(bleurt_correct > bleurt_incorrect)\n\n # BLEU\n bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]\n bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])\n bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])\n bleu_max = bleu_correct\n bleu_diff = bleu_correct - bleu_incorrect\n bleu_acc = int(bleu_correct > bleu_incorrect)\n\n # ROUGE-N\n rouge_scores = [rouge([ref], [completion]) for ref in all_refs]\n # ROUGE-1\n rouge1_scores = [score[\"rouge1\"] for score in rouge_scores]\n rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])\n rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])\n rouge1_max = rouge1_correct\n rouge1_diff = rouge1_correct - rouge1_incorrect\n rouge1_acc = int(rouge1_correct > rouge1_incorrect)\n # ROUGE-2\n rouge2_scores = [score[\"rouge2\"] for score in rouge_scores]\n rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])\n rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])\n rouge2_max = rouge2_correct\n rouge2_diff = rouge2_correct - rouge2_incorrect\n rouge2_acc = 
int(rouge2_correct > rouge2_incorrect)\n # ROUGE-L\n rougeL_scores = [score[\"rougeLsum\"] for score in rouge_scores]\n rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])\n rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])\n rougeL_max = rougeL_correct\n rougeL_diff = rougeL_correct - rougeL_incorrect\n rougeL_acc = int(rougeL_correct > rougeL_incorrect)\n\n return {\n # \"bleurt_max\": bleurt_max,\n # \"bleurt_acc\": bleurt_acc,\n # \"bleurt_diff\": bleurt_diff,\n \"bleu_max\": bleu_max,\n \"bleu_acc\": bleu_acc,\n \"bleu_diff\": bleu_diff,\n \"rouge1_max\": rouge1_max,\n \"rouge1_acc\": rouge1_acc,\n \"rouge1_diff\": rouge1_diff,\n \"rouge2_max\": rouge2_max,\n \"rouge2_acc\": rouge2_acc,\n \"rouge2_diff\": rouge2_diff,\n \"rougeL_max\": rougeL_max,\n \"rougeL_acc\": rougeL_acc,\n \"rougeL_diff\": rougeL_diff,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "def process_results_mc2(doc, results):\n ll, _ = zip(*results)\n ll = np.array(ll)\n\n # Convert log-likelihoods to probabilities.\n probs = np.exp(ll)\n\n # Normalize probabilities.\n probs_norm = probs / np.sum(probs)\n\n labels = np.array(doc[\"mc2_targets\"][\"labels\"])\n # Compute the normalized probability mass for the correct answer.\n pm_true = np.sum(probs_norm[labels == 1])\n\n return {\"acc\": pm_true}\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], 
doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-dpo-full/", + "attn_implementation": "flash_attention_2" + } + } + }, + "versions": { + "arc_challenge": 1.0, + "gsm8k": 3.0, + "hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + "mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "truthfulqa_gen": 3.0, + "truthfulqa_mc1": 2.0, + "truthfulqa_mc2": 3.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 25, + "gsm8k": 5, + "hellaswag": 10, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + 
"mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0, + "winogrande": 5 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "gsm8k": { + "exact_match": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "mmlu": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": 
true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + "mmlu_social_sciences": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "openllm": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "truthfulqa_gen": { + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true + }, + "truthfulqa_mc1": { + "acc": true + }, + "truthfulqa_mc2": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "arc_challenge": { + "original": 1172, + "effective": 1172 + }, + "hellaswag": { + "original": 10042, + "effective": 10042 + }, + "truthfulqa_gen": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc2": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc1": { + "original": 817, + "effective": 817 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + 
"mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + "effective": 103 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_world_religions": { + 
"original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "winogrande": { + "original": 1267, + "effective": 1267 + }, + "gsm8k": { + "original": 1319, + "effective": 1319 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=/models/zephyr-8x7b-dpo-full/,attn_implementation=flash_attention_2", + "model_num_parameters": 46702792704, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "", + "batch_size": "64", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "8bc4aff", + "date": 1748644898.7235322, + "pretty_env_info": "PyTorch version: 2.7.0a0+git6374332\nIs debug build: False\nCUDA used to build PyTorch: N/A\nROCM used to build PyTorch: 6.3.42134-a9a80e791\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 18.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.3.4 25012 e5bf7e55c91490b07c49d8960fa7983d864936c4)\nCMake version: version 3.31.6\nLibc version: glibc-2.35\n\nPython version: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-72-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)\nNvidia driver version: Could not collect\ncuDNN version: Could not collect\nHIP runtime version: 6.3.42134\nMIOpen runtime version: 3.3.0\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 52 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 9554 64-Core Processor\nCPU family: 25\nModel: 17\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3762.9880\nCPU min MHz: 1500.0000\nBogoMIPS: 6190.45\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku 
ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 128 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] mypy==1.9.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] optree==0.14.1\n[pip3] torch==2.7.0a0+git6374332\n[pip3] torchao==0.10.0.dev20250324+rocm6.3\n[pip3] torchdata==0.11.0\n[pip3] torchtune==0.0.0\n[pip3] torchvision==0.22.0a0+956025b\n[pip3] triton==3.2.0\n[conda] No relevant packages", + "transformers_version": "4.46.3", + "lm_eval_version": "0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "2" + ], + "tokenizer_eos_token": [ + "", + "2" + ], + "tokenizer_bos_token": [ + "", + "1" + ], + "eot_token_id": 2, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "/models/zephyr-8x7b-dpo-full/", + "model_name_sanitized": "__models__zephyr-8x7b-dpo-full__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 2913945.400664889, + "end_time": 2915009.449322183, + "total_evaluation_time_seconds": "1064.0486572943628" +} \ No newline at end of file diff --git a/old_eval_results/zephyr-8x7b-sft-full/__models__zephyr-8x7b-sft-full__/results_2025-05-30T22-41-20.839434.json b/old_eval_results/zephyr-8x7b-sft-full/__models__zephyr-8x7b-sft-full__/results_2025-05-30T22-41-20.839434.json new file mode 100644 index 0000000000000000000000000000000000000000..484e93b1a63c0ecb66fd3307295eb5b94fd96a26 --- /dev/null +++ b/old_eval_results/zephyr-8x7b-sft-full/__models__zephyr-8x7b-sft-full__/results_2025-05-30T22-41-20.839434.json @@ -0,0 +1,4057 @@ +{ + "results": { + "openllm": { + " ": " ", + "alias": "Open LLM Leaderboard" + }, + "arc_challenge": { + "alias": " - arc_challenge", + "acc,none": 0.6467576791808873, + "acc_stderr,none": 0.013967822714840053, + "acc_norm,none": 0.6680887372013652, + "acc_norm_stderr,none": 0.013760988200880534 + }, + "gsm8k": { + "alias": " - gsm8k", + "exact_match,strict-match": 0.3169067475360121, + "exact_match_stderr,strict-match": 0.012815868296721378, + "exact_match,flexible-extract": 0.6262319939347991, + "exact_match_stderr,flexible-extract": 0.01332634286073702 + }, + "hellaswag": { + "alias": " - hellaswag", + "acc,none": 0.6659032065325632, + "acc_stderr,none": 0.004707097816047541, + "acc_norm,none": 0.8607847042421828, + "acc_norm_stderr,none": 0.003454635760066271 + }, + "mmlu": { + "acc,none": 0.6959834781370176, + "acc_stderr,none": 0.0036598644759596893, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 
0.6512221041445271, + "acc_stderr,none": 0.006564739976555933, + "alias": " - humanities" + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.5634920634920635, + "acc_stderr,none": 0.04435932892851466 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.806060606060606, + "acc_stderr,none": 0.03087414513656209 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8725490196078431, + "acc_stderr,none": 0.02340553048084631 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.8523206751054853, + "acc_stderr,none": 0.023094329582595694 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.8677685950413223, + "acc_stderr,none": 0.030922788320445795 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.8240740740740741, + "acc_stderr,none": 0.036809181416738807 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.7730061349693251, + "acc_stderr,none": 0.03291099578615771 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.7890173410404624, + "acc_stderr,none": 0.021966309947043114 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.4659217877094972, + "acc_stderr,none": 0.01668361583748686 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7909967845659164, + "acc_stderr,none": 0.023093140398374224 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.8117283950617284, + "acc_stderr,none": 0.02175186606081585 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.530638852672751, + "acc_stderr,none": 0.012746237711716634 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8596491228070176, + "acc_stderr,none": 0.0266405825391332 + }, + "mmlu_other": { + "acc,none": 0.7573221757322176, + "acc_stderr,none": 0.007341965227724632, + "alias": " - other" + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.65, + "acc_stderr,none": 0.0479372485441102 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.7886792452830189, + "acc_stderr,none": 0.025125766484827845 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.7167630057803468, + "acc_stderr,none": 0.034355680560478746 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.44, + "acc_stderr,none": 0.04988876515698589 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.7713004484304933, + "acc_stderr,none": 0.028188240046929203 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.8446601941747572, + "acc_stderr,none": 0.03586594738573973 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.9188034188034188, + "acc_stderr,none": 0.017893784904018536 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.78, + "acc_stderr,none": 0.04163331998932262 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8710089399744572, + "acc_stderr,none": 0.011986371548086894 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7679738562091504, + "acc_stderr,none": 0.02417084087934087 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.5141843971631206, + "acc_stderr,none": 
0.02981549448368206 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.7573529411764706, + "acc_stderr,none": 0.02604066247420126 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5481927710843374, + "acc_stderr,none": 0.038743715565879536 + }, + "mmlu_social_sciences": { + "acc,none": 0.7998050048748782, + "acc_stderr,none": 0.007074232053128125, + "alias": " - social sciences" + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.6228070175438597, + "acc_stderr,none": 0.045595221419582166 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.8434343434343434, + "acc_stderr,none": 0.025890520358141454 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.9533678756476683, + "acc_stderr,none": 0.015216761819262575 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6948717948717948, + "acc_stderr,none": 0.023346335293325884 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.7983193277310925, + "acc_stderr,none": 0.02606431340630453 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8788990825688073, + "acc_stderr,none": 0.013987618292389713 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7862595419847328, + "acc_stderr,none": 0.0359546161177469 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.7647058823529411, + "acc_stderr,none": 0.017160587235046345 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6727272727272727, + "acc_stderr,none": 0.04494290866252088 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7755102040816326, + "acc_stderr,none": 0.02671143055553841 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.8805970149253731, + "acc_stderr,none": 0.02292879327721974 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.87, + "acc_stderr,none": 0.03379976689896308 + }, + "mmlu_stem": { + "acc,none": 0.6010149064383127, + "acc_stderr,none": 0.008349386507391598, + "alias": " - stem" + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.42, + "acc_stderr,none": 0.049604496374885836 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6888888888888889, + "acc_stderr,none": 0.039992628766177214 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.7960526315789473, + "acc_stderr,none": 0.03279000406310051 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.8263888888888888, + "acc_stderr,none": 0.03167473383795718 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.53, + "acc_stderr,none": 0.050161355804659205 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.6, + "acc_stderr,none": 0.049236596391733084 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.45, + "acc_stderr,none": 0.04999999999999999 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.4117647058823529, + "acc_stderr,none": 0.048971049527263666 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.78, + "acc_stderr,none": 
0.04163331998932263 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.6382978723404256, + "acc_stderr,none": 0.03141082197596241 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.6137931034482759, + "acc_stderr,none": 0.04057324734419035 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.48677248677248675, + "acc_stderr,none": 0.025742297289575142 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.8129032258064516, + "acc_stderr,none": 0.022185710092252252 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.6108374384236454, + "acc_stderr,none": 0.034304624161038716 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.7, + "acc_stderr,none": 0.046056618647183814 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.37407407407407406, + "acc_stderr,none": 0.02950286112895529 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.4105960264900662, + "acc_stderr,none": 0.04016689594849929 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.6296296296296297, + "acc_stderr,none": 0.03293377139415191 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.6607142857142857, + "acc_stderr,none": 0.0449394906861354 + }, + "truthfulqa_gen": { + "alias": " - truthfulqa_gen", + "bleu_max,none": 27.796640840332962, + "bleu_max_stderr,none": 0.8208435861315281, + "bleu_acc,none": 0.4504283965728274, + "bleu_acc_stderr,none": 0.017417264371967642, + "bleu_diff,none": -0.7517027393564264, + "bleu_diff_stderr,none": 0.8649851713110286, + "rouge1_max,none": 53.30905777130096, + "rouge1_max_stderr,none": 0.8914745410824457, + "rouge1_acc,none": 0.44430844553243576, + "rouge1_acc_stderr,none": 0.017394586250743173, + "rouge1_diff,none": -0.9323778612826079, + "rouge1_diff_stderr,none": 1.0470694276603842, + "rouge2_max,none": 37.943965865840156, + "rouge2_max_stderr,none": 1.057007827259565, + "rouge2_acc,none": 0.37454100367197063, + "rouge2_acc_stderr,none": 0.01694353512840532, + "rouge2_diff,none": -1.984947923543481, + "rouge2_diff_stderr,none": 1.2017239249485427, + "rougeL_max,none": 50.65364107690044, + "rougeL_max_stderr,none": 0.9008845891078575, + "rougeL_acc,none": 0.4394124847001224, + "rougeL_acc_stderr,none": 0.017374520482513707, + "rougeL_diff,none": -1.1710723426547962, + "rougeL_diff_stderr,none": 1.0559674189670347 + }, + "truthfulqa_mc1": { + "alias": " - truthfulqa_mc1", + "acc,none": 0.34394124847001223, + "acc_stderr,none": 0.01662908751427678 + }, + "truthfulqa_mc2": { + "alias": " - truthfulqa_mc2", + "acc,none": 0.48498720864914324, + "acc_stderr,none": 0.01471245269584186 + }, + "winogrande": { + "alias": " - winogrande", + "acc,none": 0.819258089976322, + "acc_stderr,none": 0.010814911009613971 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.6959834781370176, + "acc_stderr,none": 0.0036598644759596893, + "alias": " - mmlu" + }, + "mmlu_humanities": { + "acc,none": 0.6512221041445271, + "acc_stderr,none": 0.006564739976555933, + "alias": " - humanities" + }, + "mmlu_other": { + "acc,none": 0.7573221757322176, + "acc_stderr,none": 0.007341965227724632, + "alias": " - other" + }, + "mmlu_social_sciences": { + "acc,none": 0.7998050048748782, + "acc_stderr,none": 
0.007074232053128125, + "alias": " - social sciences" + }, + "mmlu_stem": { + "acc,none": 0.6010149064383127, + "acc_stderr,none": 0.008349386507391598, + "alias": " - stem" + } + }, + "group_subtasks": { + "mmlu_humanities": [ + "mmlu_formal_logic", + "mmlu_high_school_world_history", + "mmlu_jurisprudence", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_international_law", + "mmlu_world_religions", + "mmlu_professional_law", + "mmlu_high_school_european_history", + "mmlu_high_school_us_history", + "mmlu_prehistory", + "mmlu_logical_fallacies", + "mmlu_philosophy" + ], + "mmlu_social_sciences": [ + "mmlu_public_relations", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_psychology", + "mmlu_sociology", + "mmlu_professional_psychology", + "mmlu_us_foreign_policy", + "mmlu_security_studies", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_human_sexuality", + "mmlu_high_school_microeconomics", + "mmlu_econometrics" + ], + "mmlu_other": [ + "mmlu_nutrition", + "mmlu_human_aging", + "mmlu_management", + "mmlu_college_medicine", + "mmlu_professional_accounting", + "mmlu_miscellaneous", + "mmlu_business_ethics", + "mmlu_professional_medicine", + "mmlu_medical_genetics", + "mmlu_virology", + "mmlu_clinical_knowledge", + "mmlu_global_facts", + "mmlu_marketing" + ], + "mmlu_stem": [ + "mmlu_computer_security", + "mmlu_abstract_algebra", + "mmlu_high_school_computer_science", + "mmlu_machine_learning", + "mmlu_high_school_chemistry", + "mmlu_college_physics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_high_school_statistics", + "mmlu_college_computer_science", + "mmlu_high_school_biology", + "mmlu_high_school_physics", + "mmlu_college_chemistry", + "mmlu_college_mathematics", + "mmlu_college_biology", + "mmlu_high_school_mathematics", + "mmlu_astronomy", + "mmlu_anatomy", + "mmlu_conceptual_physics" + ], + "mmlu": [ + "mmlu_stem", + "mmlu_other", + "mmlu_social_sciences", + "mmlu_humanities" + ], + "openllm": [ + "arc_challenge", + "hellaswag", + "truthfulqa_gen", + "truthfulqa_mc2", + "truthfulqa_mc1", + "mmlu", + "winogrande", + "gsm8k" + ] + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "tag": [ + "ai2_arc" + ], + "dataset_path": "allenai/ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "unsafe_code": false, + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "gsm8k": { + "task": "gsm8k", + "tag": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "unsafe_code": false, + "description": "", + "target_delimiter": " 
", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### ", + "\\.$" + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "Question:", + "", + "<|im_end|>" + ], + "do_sample": false, + "temperature": 0.0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "strict-match", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + }, + { + "name": "flexible-extract", + "filter": [ + { + "function": "regex", + "group_select": -1, + "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "hellaswag": { + "task": "hellaswag", + "tag": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "unsafe_code": false, + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "abstract_algebra", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "anatomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "astronomy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "business_ethics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "clinical_knowledge", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "college_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "computer_security", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "conceptual_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "econometrics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "electrical_engineering", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "elementary_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "formal_logic", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "global_facts", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_biology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_chemistry", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_computer_science", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_european_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_geography", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_government_and_politics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_macroeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_mathematics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_microeconomics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_physics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_statistics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_us_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "high_school_world_history", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_aging", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "human_sexuality", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "international_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "jurisprudence", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "logical_fallacies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "tag": "mmlu_stem_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "machine_learning", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "management", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "marketing", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "medical_genetics", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "miscellaneous", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_disputes", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "moral_scenarios", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "nutrition", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "philosophy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "prehistory", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_accounting", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_law", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_medicine", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "professional_psychology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "public_relations", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "security_studies", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "sociology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "tag": "mmlu_social_sciences_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "us_foreign_policy", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "tag": "mmlu_other_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "virology", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "tag": "mmlu_humanities_tasks", + "dataset_path": "cais/mmlu", + "dataset_name": "world_religions", + "dataset_kwargs": { + "trust_remote_code": true + }, + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "unsafe_code": false, + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:\n return dataset.map(preprocess_function)\n", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "unsafe_code": false, + "process_results": "def process_results_gen(doc, results):\n completion = results[0]\n true_refs, false_refs = doc[\"correct_answers\"], doc[\"incorrect_answers\"]\n all_refs = true_refs + false_refs\n\n # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.\n\n # # BLEURT\n # bleurt_scores_true = self.bleurt.compute(\n # predictions=[completion] * len(true_refs), references=true_refs\n # )[\"scores\"]\n # bleurt_scores_false = self.bleurt.compute(\n # predictions=[completion] * len(false_refs), references=false_refs\n # )[\"scores\"]\n # bleurt_correct = max(bleurt_scores_true)\n # bleurt_incorrect = max(bleurt_scores_false)\n # bleurt_max = bleurt_correct\n # bleurt_diff = bleurt_correct - bleurt_incorrect\n # bleurt_acc = int(bleurt_correct > bleurt_incorrect)\n\n # BLEU\n bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]\n bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])\n bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])\n bleu_max = bleu_correct\n bleu_diff = bleu_correct - bleu_incorrect\n bleu_acc = int(bleu_correct > bleu_incorrect)\n\n # ROUGE-N\n rouge_scores = [rouge([ref], [completion]) for ref in all_refs]\n # ROUGE-1\n rouge1_scores = [score[\"rouge1\"] for score in rouge_scores]\n rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])\n rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])\n rouge1_max = rouge1_correct\n rouge1_diff = rouge1_correct - rouge1_incorrect\n rouge1_acc = int(rouge1_correct > rouge1_incorrect)\n # ROUGE-2\n rouge2_scores = [score[\"rouge2\"] for score in rouge_scores]\n rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])\n rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])\n rouge2_max = rouge2_correct\n rouge2_diff = rouge2_correct - rouge2_incorrect\n rouge2_acc = int(rouge2_correct > rouge2_incorrect)\n # ROUGE-L\n rougeL_scores = [score[\"rougeLsum\"] for score in rouge_scores]\n rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])\n rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])\n rougeL_max = rougeL_correct\n rougeL_diff = rougeL_correct - rougeL_incorrect\n rougeL_acc = int(rougeL_correct > rougeL_incorrect)\n\n return {\n # \"bleurt_max\": bleurt_max,\n # \"bleurt_acc\": bleurt_acc,\n # \"bleurt_diff\": bleurt_diff,\n \"bleu_max\": bleu_max,\n \"bleu_acc\": bleu_acc,\n \"bleu_diff\": bleu_diff,\n \"rouge1_max\": rouge1_max,\n \"rouge1_acc\": rouge1_acc,\n \"rouge1_diff\": rouge1_diff,\n \"rouge2_max\": rouge2_max,\n \"rouge2_acc\": rouge2_acc,\n \"rouge2_diff\": rouge2_diff,\n \"rougeL_max\": rougeL_max,\n \"rougeL_acc\": rougeL_acc,\n \"rougeL_diff\": rougeL_diff,\n }\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + 
"metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "tag": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. 
Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "unsafe_code": false, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "def process_results_mc2(doc, results):\n ll, _ = zip(*results)\n ll = np.array(ll)\n\n # Convert log-likelihoods to probabilities.\n probs = np.exp(ll)\n\n # Normalize probabilities.\n probs_norm = probs / np.sum(probs)\n\n labels = np.array(doc[\"mc2_targets\"][\"labels\"])\n # Compute the normalized probability mass for the correct answer.\n pm_true = np.sum(probs_norm[labels == 1])\n\n return {\"acc\": pm_true}\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 3.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "dataset_kwargs": { + "trust_remote_code": true + }, + "training_split": "train", + "validation_split": "validation", + "fewshot_split": "train", + "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", + "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", + "unsafe_code": false, + "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1.0, + "pretrained": "/models/zephyr-8x7b-sft-full/", + "attn_implementation": "flash_attention_2" + } + } + }, + "versions": { + "arc_challenge": 1.0, + "gsm8k": 3.0, + "hellaswag": 1.0, + "mmlu": 2, + "mmlu_abstract_algebra": 1.0, + "mmlu_anatomy": 1.0, + "mmlu_astronomy": 1.0, + "mmlu_business_ethics": 1.0, + "mmlu_clinical_knowledge": 1.0, + "mmlu_college_biology": 1.0, + "mmlu_college_chemistry": 1.0, + "mmlu_college_computer_science": 1.0, + "mmlu_college_mathematics": 1.0, + "mmlu_college_medicine": 1.0, + "mmlu_college_physics": 1.0, + "mmlu_computer_security": 1.0, + "mmlu_conceptual_physics": 1.0, + "mmlu_econometrics": 1.0, + "mmlu_electrical_engineering": 1.0, + "mmlu_elementary_mathematics": 1.0, + "mmlu_formal_logic": 1.0, + "mmlu_global_facts": 1.0, + "mmlu_high_school_biology": 1.0, + "mmlu_high_school_chemistry": 1.0, + "mmlu_high_school_computer_science": 1.0, + "mmlu_high_school_european_history": 1.0, + "mmlu_high_school_geography": 1.0, + 
"mmlu_high_school_government_and_politics": 1.0, + "mmlu_high_school_macroeconomics": 1.0, + "mmlu_high_school_mathematics": 1.0, + "mmlu_high_school_microeconomics": 1.0, + "mmlu_high_school_physics": 1.0, + "mmlu_high_school_psychology": 1.0, + "mmlu_high_school_statistics": 1.0, + "mmlu_high_school_us_history": 1.0, + "mmlu_high_school_world_history": 1.0, + "mmlu_human_aging": 1.0, + "mmlu_human_sexuality": 1.0, + "mmlu_humanities": 2, + "mmlu_international_law": 1.0, + "mmlu_jurisprudence": 1.0, + "mmlu_logical_fallacies": 1.0, + "mmlu_machine_learning": 1.0, + "mmlu_management": 1.0, + "mmlu_marketing": 1.0, + "mmlu_medical_genetics": 1.0, + "mmlu_miscellaneous": 1.0, + "mmlu_moral_disputes": 1.0, + "mmlu_moral_scenarios": 1.0, + "mmlu_nutrition": 1.0, + "mmlu_other": 2, + "mmlu_philosophy": 1.0, + "mmlu_prehistory": 1.0, + "mmlu_professional_accounting": 1.0, + "mmlu_professional_law": 1.0, + "mmlu_professional_medicine": 1.0, + "mmlu_professional_psychology": 1.0, + "mmlu_public_relations": 1.0, + "mmlu_security_studies": 1.0, + "mmlu_social_sciences": 2, + "mmlu_sociology": 1.0, + "mmlu_stem": 2, + "mmlu_us_foreign_policy": 1.0, + "mmlu_virology": 1.0, + "mmlu_world_religions": 1.0, + "truthfulqa_gen": 3.0, + "truthfulqa_mc1": 2.0, + "truthfulqa_mc2": 3.0, + "winogrande": 1.0 + }, + "n-shot": { + "arc_challenge": 25, + "gsm8k": 5, + "hellaswag": 10, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_sociology": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0, + "winogrande": 5 + }, + "higher_is_better": { + "arc_challenge": { + "acc": true, + "acc_norm": true + }, + "gsm8k": { + "exact_match": true + }, + "hellaswag": { + "acc": true, + "acc_norm": true + }, + "mmlu": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + 
"rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_abstract_algebra": { + "acc": true + }, + "mmlu_anatomy": { + "acc": true + }, + "mmlu_astronomy": { + "acc": true + }, + "mmlu_business_ethics": { + "acc": true + }, + "mmlu_clinical_knowledge": { + "acc": true + }, + "mmlu_college_biology": { + "acc": true + }, + "mmlu_college_chemistry": { + "acc": true + }, + "mmlu_college_computer_science": { + "acc": true + }, + "mmlu_college_mathematics": { + "acc": true + }, + "mmlu_college_medicine": { + "acc": true + }, + "mmlu_college_physics": { + "acc": true + }, + "mmlu_computer_security": { + "acc": true + }, + "mmlu_conceptual_physics": { + "acc": true + }, + "mmlu_econometrics": { + "acc": true + }, + "mmlu_electrical_engineering": { + "acc": true + }, + "mmlu_elementary_mathematics": { + "acc": true + }, + "mmlu_formal_logic": { + "acc": true + }, + "mmlu_global_facts": { + "acc": true + }, + "mmlu_high_school_biology": { + "acc": true + }, + "mmlu_high_school_chemistry": { + "acc": true + }, + "mmlu_high_school_computer_science": { + "acc": true + }, + "mmlu_high_school_european_history": { + "acc": true + }, + "mmlu_high_school_geography": { + "acc": true + }, + "mmlu_high_school_government_and_politics": { + "acc": true + }, + "mmlu_high_school_macroeconomics": { + "acc": true + }, + "mmlu_high_school_mathematics": { + "acc": true + }, + "mmlu_high_school_microeconomics": { + "acc": true + }, + "mmlu_high_school_physics": { + "acc": true + }, + "mmlu_high_school_psychology": { + "acc": true + }, + "mmlu_high_school_statistics": { + "acc": true + }, + "mmlu_high_school_us_history": { + "acc": true + }, + "mmlu_high_school_world_history": { + "acc": true + }, + "mmlu_human_aging": { + "acc": true + }, + "mmlu_human_sexuality": { + "acc": true + }, + "mmlu_humanities": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_international_law": { + "acc": true + }, + "mmlu_jurisprudence": { + "acc": true + }, + "mmlu_logical_fallacies": { + "acc": true + }, + "mmlu_machine_learning": { + "acc": true + }, + "mmlu_management": { + "acc": true + }, + "mmlu_marketing": { + "acc": true + }, + "mmlu_medical_genetics": { + "acc": true + }, + "mmlu_miscellaneous": { + "acc": true + }, + "mmlu_moral_disputes": { + "acc": true + }, + "mmlu_moral_scenarios": { + "acc": true + }, + "mmlu_nutrition": { + "acc": true + }, + "mmlu_other": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_philosophy": { + "acc": true + }, + "mmlu_prehistory": { + "acc": true + }, + "mmlu_professional_accounting": { + "acc": true + }, + "mmlu_professional_law": { + "acc": true + }, + "mmlu_professional_medicine": { + "acc": true + }, + "mmlu_professional_psychology": { + "acc": true + }, + "mmlu_public_relations": { + "acc": true + }, + "mmlu_security_studies": { + "acc": true + }, + 
"mmlu_social_sciences": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_sociology": { + "acc": true + }, + "mmlu_stem": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "mmlu_us_foreign_policy": { + "acc": true + }, + "mmlu_virology": { + "acc": true + }, + "mmlu_world_religions": { + "acc": true + }, + "openllm": { + "acc": true, + "acc_norm": true, + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true, + "exact_match": true + }, + "truthfulqa_gen": { + "bleu_max": true, + "bleu_acc": true, + "bleu_diff": true, + "rouge1_max": true, + "rouge1_acc": true, + "rouge1_diff": true, + "rouge2_max": true, + "rouge2_acc": true, + "rouge2_diff": true, + "rougeL_max": true, + "rougeL_acc": true, + "rougeL_diff": true + }, + "truthfulqa_mc1": { + "acc": true + }, + "truthfulqa_mc2": { + "acc": true + }, + "winogrande": { + "acc": true + } + }, + "n-samples": { + "arc_challenge": { + "original": 1172, + "effective": 1172 + }, + "hellaswag": { + "original": 10042, + "effective": 10042 + }, + "truthfulqa_gen": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc2": { + "original": 817, + "effective": 817 + }, + "truthfulqa_mc1": { + "original": 817, + "effective": 817 + }, + "mmlu_computer_security": { + "original": 100, + "effective": 100 + }, + "mmlu_abstract_algebra": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_machine_learning": { + "original": 112, + "effective": 112 + }, + "mmlu_high_school_chemistry": { + "original": 203, + "effective": 203 + }, + "mmlu_college_physics": { + "original": 102, + "effective": 102 + }, + "mmlu_electrical_engineering": { + "original": 145, + "effective": 145 + }, + "mmlu_elementary_mathematics": { + "original": 378, + "effective": 378 + }, + "mmlu_high_school_statistics": { + "original": 216, + "effective": 216 + }, + "mmlu_college_computer_science": { + "original": 100, + "effective": 100 + }, + "mmlu_high_school_biology": { + "original": 310, + "effective": 310 + }, + "mmlu_high_school_physics": { + "original": 151, + "effective": 151 + }, + "mmlu_college_chemistry": { + "original": 100, + "effective": 100 + }, + "mmlu_college_mathematics": { + "original": 100, + "effective": 100 + }, + "mmlu_college_biology": { + "original": 144, + "effective": 144 + }, + "mmlu_high_school_mathematics": { + "original": 270, + "effective": 270 + }, + "mmlu_astronomy": { + "original": 152, + "effective": 152 + }, + "mmlu_anatomy": { + "original": 135, + "effective": 135 + }, + "mmlu_conceptual_physics": { + "original": 235, + "effective": 235 + }, + "mmlu_nutrition": { + "original": 306, + "effective": 306 + }, + "mmlu_human_aging": { + "original": 223, + "effective": 223 + }, + "mmlu_management": { + "original": 103, + 
"effective": 103 + }, + "mmlu_college_medicine": { + "original": 173, + "effective": 173 + }, + "mmlu_professional_accounting": { + "original": 282, + "effective": 282 + }, + "mmlu_miscellaneous": { + "original": 783, + "effective": 783 + }, + "mmlu_business_ethics": { + "original": 100, + "effective": 100 + }, + "mmlu_professional_medicine": { + "original": 272, + "effective": 272 + }, + "mmlu_medical_genetics": { + "original": 100, + "effective": 100 + }, + "mmlu_virology": { + "original": 166, + "effective": 166 + }, + "mmlu_clinical_knowledge": { + "original": 265, + "effective": 265 + }, + "mmlu_global_facts": { + "original": 100, + "effective": 100 + }, + "mmlu_marketing": { + "original": 234, + "effective": 234 + }, + "mmlu_public_relations": { + "original": 110, + "effective": 110 + }, + "mmlu_high_school_macroeconomics": { + "original": 390, + "effective": 390 + }, + "mmlu_high_school_psychology": { + "original": 545, + "effective": 545 + }, + "mmlu_sociology": { + "original": 201, + "effective": 201 + }, + "mmlu_professional_psychology": { + "original": 612, + "effective": 612 + }, + "mmlu_us_foreign_policy": { + "original": 100, + "effective": 100 + }, + "mmlu_security_studies": { + "original": 245, + "effective": 245 + }, + "mmlu_high_school_geography": { + "original": 198, + "effective": 198 + }, + "mmlu_high_school_government_and_politics": { + "original": 193, + "effective": 193 + }, + "mmlu_human_sexuality": { + "original": 131, + "effective": 131 + }, + "mmlu_high_school_microeconomics": { + "original": 238, + "effective": 238 + }, + "mmlu_econometrics": { + "original": 114, + "effective": 114 + }, + "mmlu_formal_logic": { + "original": 126, + "effective": 126 + }, + "mmlu_high_school_world_history": { + "original": 237, + "effective": 237 + }, + "mmlu_jurisprudence": { + "original": 108, + "effective": 108 + }, + "mmlu_moral_disputes": { + "original": 346, + "effective": 346 + }, + "mmlu_moral_scenarios": { + "original": 895, + "effective": 895 + }, + "mmlu_international_law": { + "original": 121, + "effective": 121 + }, + "mmlu_world_religions": { + "original": 171, + "effective": 171 + }, + "mmlu_professional_law": { + "original": 1534, + "effective": 1534 + }, + "mmlu_high_school_european_history": { + "original": 165, + "effective": 165 + }, + "mmlu_high_school_us_history": { + "original": 204, + "effective": 204 + }, + "mmlu_prehistory": { + "original": 324, + "effective": 324 + }, + "mmlu_logical_fallacies": { + "original": 163, + "effective": 163 + }, + "mmlu_philosophy": { + "original": 311, + "effective": 311 + }, + "winogrande": { + "original": 1267, + "effective": 1267 + }, + "gsm8k": { + "original": 1319, + "effective": 1319 + } + }, + "config": { + "model": "hf", + "model_args": "pretrained=/models/zephyr-8x7b-sft-full/,attn_implementation=flash_attention_2", + "model_num_parameters": 46702792704, + "model_dtype": "torch.bfloat16", + "model_revision": "main", + "model_sha": "", + "batch_size": "64", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null, + "random_seed": 0, + "numpy_seed": 1234, + "torch_seed": 1234, + "fewshot_seed": 1234 + }, + "git_hash": "8bc4aff", + "date": 1748643897.7621276, + "pretty_env_info": "PyTorch version: 2.7.0a0+git6374332\nIs debug build: False\nCUDA used to build PyTorch: N/A\nROCM used to build PyTorch: 6.3.42134-a9a80e791\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: 18.0.0git 
(https://github.com/RadeonOpenCompute/llvm-project roc-6.3.4 25012 e5bf7e55c91490b07c49d8960fa7983d864936c4)\nCMake version: version 3.31.6\nLibc version: glibc-2.35\n\nPython version: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-72-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: AMD Instinct MI300X (gfx942:sramecc+:xnack-)\nNvidia driver version: Could not collect\ncuDNN version: Could not collect\nHIP runtime version: 6.3.42134\nMIOpen runtime version: 3.3.0\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 52 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 9554 64-Core Processor\nCPU family: 25\nModel: 17\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3762.9880\nCPU min MHz: 1500.0000\nBogoMIPS: 6190.45\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 128 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] mypy==1.9.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] optree==0.14.1\n[pip3] torch==2.7.0a0+git6374332\n[pip3] torchao==0.10.0.dev20250324+rocm6.3\n[pip3] torchdata==0.11.0\n[pip3] torchtune==0.0.0\n[pip3] torchvision==0.22.0a0+956025b\n[pip3] triton==3.2.0\n[conda] No relevant packages", + "transformers_version": "4.46.3", + "lm_eval_version": 
"0.4.8", + "upper_git_hash": null, + "tokenizer_pad_token": [ + "", + "2" + ], + "tokenizer_eos_token": [ + "", + "2" + ], + "tokenizer_bos_token": [ + "", + "1" + ], + "eot_token_id": 2, + "max_length": 32768, + "task_hashes": {}, + "model_source": "hf", + "model_name": "/models/zephyr-8x7b-sft-full/", + "model_name_sanitized": "__models__zephyr-8x7b-sft-full__", + "system_instruction": null, + "system_instruction_sha": null, + "fewshot_as_multiturn": false, + "chat_template": null, + "chat_template_sha": null, + "start_time": 2912944.423443072, + "end_time": 2913929.340807741, + "total_evaluation_time_seconds": "984.9173646690324" +} \ No newline at end of file