Invalid JSON: Unexpected token 'N', ..."ty,none": NaN,
"... is not valid JSON
| { | |
| "results": { | |
| "arc_challenge": { | |
| "acc,none": 0.2158703071672355, | |
| "acc_stderr,none": 0.01202297536003066, | |
| "acc_norm,none": 0.2295221843003413, | |
| "acc_norm_stderr,none": 0.012288926760890799 | |
| }, | |
| "arc_easy": { | |
| "acc,none": 0.3265993265993266, | |
| "acc_stderr,none": 0.009623047038267652, | |
| "acc_norm,none": 0.3287037037037037, | |
| "acc_norm_stderr,none": 0.009638903167022162 | |
| }, | |
| "lambada_openai": { | |
| "perplexity,none": null, | |
| "perplexity_stderr,none": null, | |
| "acc,none": 0.17504366388511547, | |
| "acc_stderr,none": 0.005294204972653027 | |
| }, | |
| "logiqa": { | |
| "acc,none": 0.20276497695852536, | |
| "acc_stderr,none": 0.015770046635584564, | |
| "acc_norm,none": 0.20276497695852536, | |
| "acc_norm_stderr,none": 0.015770046635584564 | |
| }, | |
| "piqa": { | |
| "acc,none": 0.544069640914037, | |
| "acc_stderr,none": 0.011620422647622237, | |
| "acc_norm,none": 0.544613710554951, | |
| "acc_norm_stderr,none": 0.011619292444157079 | |
| }, | |
| "sciq": { | |
| "acc,none": 0.205, | |
| "acc_stderr,none": 0.012772554096113114, | |
| "acc_norm,none": 0.194, | |
| "acc_norm_stderr,none": 0.012510816141264348 | |
| }, | |
| "winogrande": { | |
| "acc,none": 0.5043409629044988, | |
| "acc_stderr,none": 0.0140519560640769 | |
| }, | |
| "wsc": { | |
| "acc,none": 0.6153846153846154, | |
| "acc_stderr,none": 0.047936688680750406 | |
| } | |
| }, | |
| "configs": { | |
| "arc_challenge": { | |
| "task": "arc_challenge", | |
| "group": [ | |
| "ai2_arc" | |
| ], | |
| "dataset_path": "ai2_arc", | |
| "dataset_name": "ARC-Challenge", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "Question: {{question}}\nAnswer:", | |
| "doc_to_target": "{{choices.label.index(answerKey)}}", | |
| "doc_to_choice": "{{choices.text}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "Question: {{question}}\nAnswer:" | |
| }, | |
| "arc_easy": { | |
| "task": "arc_easy", | |
| "group": [ | |
| "ai2_arc" | |
| ], | |
| "dataset_path": "ai2_arc", | |
| "dataset_name": "ARC-Easy", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "Question: {{question}}\nAnswer:", | |
| "doc_to_target": "{{choices.label.index(answerKey)}}", | |
| "doc_to_choice": "{{choices.text}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "Question: {{question}}\nAnswer:" | |
| }, | |
| "lambada_openai": { | |
| "task": "lambada_openai", | |
| "group": [ | |
| "lambada" | |
| ], | |
| "dataset_path": "EleutherAI/lambada_openai", | |
| "dataset_name": "default", | |
| "test_split": "test", | |
| "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}", | |
| "doc_to_target": "{{' '+text.split(' ')[-1]}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "perplexity", | |
| "aggregation": "perplexity", | |
| "higher_is_better": false | |
| }, | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "loglikelihood", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "{{text}}" | |
| }, | |
| "logiqa": { | |
| "task": "logiqa", | |
| "dataset_path": "EleutherAI/logiqa", | |
| "dataset_name": "logiqa", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "<function doc_to_text at 0x7fa8b39f4d30>", | |
| "doc_to_target": "<function doc_to_target at 0x7fa8b39f5090>", | |
| "doc_to_choice": "{{options}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "{{context}}" | |
| }, | |
| "piqa": { | |
| "task": "piqa", | |
| "dataset_path": "piqa", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "Question: {{goal}}\nAnswer:", | |
| "doc_to_target": "label", | |
| "doc_to_choice": "{{[sol1, sol2]}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "goal" | |
| }, | |
| "sciq": { | |
| "task": "sciq", | |
| "dataset_path": "sciq", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "test_split": "test", | |
| "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:", | |
| "doc_to_target": 3, | |
| "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| }, | |
| { | |
| "metric": "acc_norm", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "{{support}} {{question}}" | |
| }, | |
| "winogrande": { | |
| "task": "winogrande", | |
| "dataset_path": "winogrande", | |
| "dataset_name": "winogrande_xl", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "<function doc_to_text at 0x7fa8b386cf70>", | |
| "doc_to_target": "<function doc_to_target at 0x7fa8b386d2d0>", | |
| "doc_to_choice": "<function doc_to_choice at 0x7fa8b386d630>", | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc", | |
| "aggregation": "mean", | |
| "higher_is_better": true | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": true, | |
| "doc_to_decontamination_query": "sentence" | |
| }, | |
| "wsc": { | |
| "task": "wsc", | |
| "group": [ | |
| "super-glue-lm-eval-v1" | |
| ], | |
| "dataset_path": "super_glue", | |
| "dataset_name": "wsc.fixed", | |
| "training_split": "train", | |
| "validation_split": "validation", | |
| "doc_to_text": "<function default_doc_to_text at 0x7fa8b38f49d0>", | |
| "doc_to_target": "label", | |
| "doc_to_choice": [ | |
| "no", | |
| "yes" | |
| ], | |
| "description": "", | |
| "target_delimiter": " ", | |
| "fewshot_delimiter": "\n\n", | |
| "num_fewshot": 0, | |
| "metric_list": [ | |
| { | |
| "metric": "acc" | |
| } | |
| ], | |
| "output_type": "multiple_choice", | |
| "repeats": 1, | |
| "should_decontaminate": false | |
| } | |
| }, | |
| "versions": { | |
| "arc_challenge": "Yaml", | |
| "arc_easy": "Yaml", | |
| "lambada_openai": "Yaml", | |
| "logiqa": "Yaml", | |
| "piqa": "Yaml", | |
| "sciq": "Yaml", | |
| "winogrande": "Yaml", | |
| "wsc": "Yaml" | |
| }, | |
| "config": { | |
| "model": "hf", | |
| "model_args": "pretrained=usvsnsp/pythia-160m-ppo", | |
| "batch_size": 1, | |
| "batch_sizes": [], | |
| "device": null, | |
| "use_cache": null, | |
| "limit": null, | |
| "bootstrap_iters": 100000, | |
| "gen_kwargs": "" | |
| }, | |
| "git_hash": "b8d7d6c3" | |
| } |