| { |
| "results": { |
| "arc_challenge": { |
| "acc,none": 0.3199658703071672, |
| "acc_stderr,none": 0.013631345807016193, |
| "acc_norm,none": 0.3532423208191126, |
| "acc_norm_stderr,none": 0.013967822714840055 |
| }, |
| "arc_easy": { |
| "acc,none": 0.6721380471380471, |
| "acc_stderr,none": 0.009632587076170016, |
| "acc_norm,none": 0.5955387205387206, |
| "acc_norm_stderr,none": 0.010070746648278783 |
| }, |
| "lambada_openai": { |
| "perplexity,none": 4.855671424502843, |
| "perplexity_stderr,none": 0.11455715396173873, |
| "acc,none": 0.6574810789831166, |
| "acc_stderr,none": 0.006611438859225025 |
| }, |
| "logiqa": { |
| "acc,none": 0.23348694316436253, |
| "acc_stderr,none": 0.016593362460570887, |
| "acc_norm,none": 0.2749615975422427, |
| "acc_norm_stderr,none": 0.017512971782225207 |
| }, |
| "piqa": { |
| "acc,none": 0.7557127312295974, |
| "acc_stderr,none": 0.010024765172284218, |
| "acc_norm,none": 0.7665941240478781, |
| "acc_norm_stderr,none": 0.009869247889521015 |
| }, |
| "sciq": { |
| "acc,none": 0.898, |
| "acc_stderr,none": 0.009575368801653895, |
| "acc_norm,none": 0.837, |
| "acc_norm_stderr,none": 0.011686212712746839 |
| }, |
| "winogrande": { |
| "acc,none": 0.6172059984214681, |
| "acc_stderr,none": 0.013660946109442016 |
| }, |
| "wsc": { |
| "acc,none": 0.36538461538461536, |
| "acc_stderr,none": 0.0474473339327792 |
| } |
| }, |
| "configs": { |
| "arc_challenge": { |
| "task": "arc_challenge", |
| "group": [ |
| "ai2_arc" |
| ], |
| "dataset_path": "ai2_arc", |
| "dataset_name": "ARC-Challenge", |
| "training_split": "train", |
| "validation_split": "validation", |
| "test_split": "test", |
| "doc_to_text": "Question: {{question}}\nAnswer:", |
| "doc_to_target": "{{choices.label.index(answerKey)}}", |
| "doc_to_choice": "{{choices.text}}", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| }, |
| { |
| "metric": "acc_norm", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "Question: {{question}}\nAnswer:" |
| }, |
| "arc_easy": { |
| "task": "arc_easy", |
| "group": [ |
| "ai2_arc" |
| ], |
| "dataset_path": "ai2_arc", |
| "dataset_name": "ARC-Easy", |
| "training_split": "train", |
| "validation_split": "validation", |
| "test_split": "test", |
| "doc_to_text": "Question: {{question}}\nAnswer:", |
| "doc_to_target": "{{choices.label.index(answerKey)}}", |
| "doc_to_choice": "{{choices.text}}", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| }, |
| { |
| "metric": "acc_norm", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "Question: {{question}}\nAnswer:" |
| }, |
| "lambada_openai": { |
| "task": "lambada_openai", |
| "group": [ |
| "lambada" |
| ], |
| "dataset_path": "EleutherAI/lambada_openai", |
| "dataset_name": "default", |
| "test_split": "test", |
| "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}", |
| "doc_to_target": "{{' '+text.split(' ')[-1]}}", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "perplexity", |
| "aggregation": "perplexity", |
| "higher_is_better": false |
| }, |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "loglikelihood", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "{{text}}" |
| }, |
| "logiqa": { |
| "task": "logiqa", |
| "dataset_path": "EleutherAI/logiqa", |
| "dataset_name": "logiqa", |
| "training_split": "train", |
| "validation_split": "validation", |
| "test_split": "test", |
| "doc_to_text": "<function doc_to_text at 0x7f38920d0dc0>", |
| "doc_to_target": "<function doc_to_target at 0x7f38920d1120>", |
| "doc_to_choice": "{{options}}", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| }, |
| { |
| "metric": "acc_norm", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "{{context}}" |
| }, |
| "piqa": { |
| "task": "piqa", |
| "dataset_path": "piqa", |
| "training_split": "train", |
| "validation_split": "validation", |
| "doc_to_text": "Question: {{goal}}\nAnswer:", |
| "doc_to_target": "label", |
| "doc_to_choice": "{{[sol1, sol2]}}", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| }, |
| { |
| "metric": "acc_norm", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "goal" |
| }, |
| "sciq": { |
| "task": "sciq", |
| "dataset_path": "sciq", |
| "training_split": "train", |
| "validation_split": "validation", |
| "test_split": "test", |
| "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:", |
| "doc_to_target": 3, |
| "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| }, |
| { |
| "metric": "acc_norm", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "{{support}} {{question}}" |
| }, |
| "winogrande": { |
| "task": "winogrande", |
| "dataset_path": "winogrande", |
| "dataset_name": "winogrande_xl", |
| "training_split": "train", |
| "validation_split": "validation", |
| "doc_to_text": "<function doc_to_text at 0x7f3892141000>", |
| "doc_to_target": "<function doc_to_target at 0x7f3892141360>", |
| "doc_to_choice": "<function doc_to_choice at 0x7f38921416c0>", |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc", |
| "aggregation": "mean", |
| "higher_is_better": true |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": true, |
| "doc_to_decontamination_query": "sentence" |
| }, |
| "wsc": { |
| "task": "wsc", |
| "group": [ |
| "super-glue-lm-eval-v1" |
| ], |
| "dataset_path": "super_glue", |
| "dataset_name": "wsc.fixed", |
| "training_split": "train", |
| "validation_split": "validation", |
| "doc_to_text": "<function default_doc_to_text at 0x7f3891fd0a60>", |
| "doc_to_target": "label", |
| "doc_to_choice": [ |
| "no", |
| "yes" |
| ], |
| "description": "", |
| "target_delimiter": " ", |
| "fewshot_delimiter": "\n\n", |
| "num_fewshot": 0, |
| "metric_list": [ |
| { |
| "metric": "acc" |
| } |
| ], |
| "output_type": "multiple_choice", |
| "repeats": 1, |
| "should_decontaminate": false |
| } |
| }, |
| "versions": { |
| "arc_challenge": "Yaml", |
| "arc_easy": "Yaml", |
| "lambada_openai": "Yaml", |
| "logiqa": "Yaml", |
| "piqa": "Yaml", |
| "sciq": "Yaml", |
| "winogrande": "Yaml", |
| "wsc": "Yaml" |
| }, |
| "config": { |
| "model": "hf", |
| "model_args": "pretrained=usvsnsp/pythia-6.9b-ppo,dtype=float16", |
| "batch_size": 1, |
| "batch_sizes": [], |
| "device": null, |
| "use_cache": null, |
| "limit": null, |
| "bootstrap_iters": 100000 |
| }, |
| "git_hash": "0aa37743" |
| } |