[ { "task": "hellaswag", "benchmark": "HellaSwag", "metric": "acc_norm", "score": 0.26847241585341564, "shots": 0, "runtime_sec": 157.33, "status": "success" }, { "task": "piqa", "benchmark": "PIQA", "metric": "acc_norm", "score": 0.4923830250272035, "shots": 0, "runtime_sec": 34.0, "status": "success" }, { "task": "winogrande", "benchmark": "WinoGrande", "metric": "acc", "score": 0.4996053670086819, "shots": 0, "runtime_sec": 26.59, "status": "success" }, { "task": "boolq", "benchmark": "BoolQ", "metric": "acc", "score": 0.6085626911314985, "shots": 0, "runtime_sec": 54.76, "status": "success" }, { "task": "arc_easy", "benchmark": "ARC-Easy", "metric": "acc_norm", "score": 0.27104377104377103, "shots": 0, "runtime_sec": 41.15, "status": "success" }, { "task": "arc_challenge", "benchmark": "ARC-Challenge", "metric": "acc_norm", "score": 0.27559726962457337, "shots": 0, "runtime_sec": 31.75, "status": "success" }, { "task": "openbookqa", "benchmark": "OpenBookQA", "metric": "acc_norm", "score": 0.298, "shots": 0, "runtime_sec": 26.69, "status": "success" }, { "task": "commonsense_qa", "benchmark": "CommonsenseQA", "metric": "acc", "score": 0.20802620802620803, "shots": 0, "runtime_sec": 30.66, "status": "success" }, { "task": "lambada_openai", "benchmark": "LAMBADA", "metric": "acc", "score": 0.0, "shots": 0, "runtime_sec": 68.38, "status": "success" }, { "task": "blimp", "benchmark": "BLiMP", "metric": "acc", "score": 0.536134328358209, "shots": 0, "runtime_sec": 471.77, "status": "success" }, { "task": "mmlu", "benchmark": "MMLU", "metric": "acc", "score": 0.2416322461187865, "shots": 0, "runtime_sec": 532.07, "status": "success" }, { "task": "wikitext", "benchmark": "WikiText-2", "metric": "word_perplexity", "score": 39301278.79233013, "shots": 0, "runtime_sec": 46.38, "status": "success" }, { "task": "wikitext", "benchmark": "WikiText-2", "metric": "byte_perplexity", "score": 26.31431152931224, "shots": 0, "runtime_sec": 39.52, "status": "success" }, { "task": "sciq", "benchmark": "SciQ", "metric": "acc_norm", "score": 0.2, "shots": 0, "runtime_sec": 48.87, "status": "success" }, { "task": "copa", "benchmark": "COPA", "metric": "acc", "score": 0.63, "shots": 0, "runtime_sec": 22.29, "status": "success" }, { "task": "race", "benchmark": "RACE", "metric": "acc", "score": 0.22392344497607655, "shots": 0, "runtime_sec": 123.26, "status": "success" }, { "task": "swag", "benchmark": "SWAG", "metric": "acc_norm", "score": 0.2589723083075077, "shots": 0, "runtime_sec": 141.34, "status": "success" }, { "task": "truthfulqa_mc2", "benchmark": "TruthfulQA MC2", "metric": "acc", "score": 0.4981403502526444, "shots": 0, "runtime_sec": 52.22, "status": "success" } ]