Spaces:
Build error
Build error
| import csv | |
| from typing import List, Optional, Tuple | |
| import pkg_resources | |
| # from rich import inspect | |
| from rich.pretty import pprint | |
| from promptsource.templates import TemplateCollection | |
def preview() -> None:
    """Summarize the D4 experiment spreadsheet and its promptsource templates.

    Reads ``experiment_D4.csv`` (packaged alongside this module), partitions
    the non-desk-rejected datasets into the train/eval mixtures declared in
    the sheet, then prints per-dataset template statistics and lists any
    templates that are missing metrics or an ``original_task`` flag.
    """
    # (dataset, subset) pairs per mixture; subset is None when the CSV cell is
    # blank so the pair matches promptsource's (name, subset) key convention.
    d4_train: List[Tuple[str, Optional[str]]] = []
    d4_eval: List[Tuple[str, Optional[str]]] = []
    d3_train_gpt: List[Tuple[str, Optional[str]]] = []
    d3_train_sglue: List[Tuple[str, Optional[str]]] = []
    gsheet = {}  # (dataset, subset) -> full CSV row, for train_size lookups below

    # NOTE: the original computed this path twice; once is enough.
    experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
    with open(experiment_path) as exp_file:
        reader = csv.DictReader(exp_file)
        for row in reader:
            if row["skip"]:  # any non-empty "skip" cell desk-rejects the dataset
                continue
            if row["subset"] == "":
                row["subset"] = None  # to match promptsource.Template object
            dataset_subset = (row["HF_name"], row["subset"])
            if row["do_train"] == "TRUE":
                d4_train.append(dataset_subset)
            if row["do_eval"] == "TRUE":
                d4_eval.append(dataset_subset)
            if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]:
                d3_train_gpt.append(dataset_subset)
            if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
                d3_train_sglue.append(dataset_subset)
            gsheet[dataset_subset] = row

    all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue
    print(f"Number of non-desk-rejected datasets = {len(all_datasets)}")
    print(f"Number of training sets = {len(d4_train)}")
    print(f"Number of evaluation sets = {len(d4_eval)}")

    template_collection = TemplateCollection()
    output = []
    missing_og_flags = []
    missing_metrics = []
    # `keys` is a property of TemplateCollection yielding (dataset, subset) pairs.
    for dataset_name, subset_name in template_collection.keys:
        ds_name = (dataset_name, subset_name)
        if ds_name not in d4_eval:
            # Prune everything outside the evaluation mixture so the final
            # len(template_collection) reflects eval datasets only.
            template_collection.remove(dataset_name, subset_name)
            continue

        original_count = 0  # templates flagged as the dataset's original task
        non_original_count = 0
        dataset = template_collection.get_dataset(dataset_name, subset_name)
        for template_name in dataset.all_template_names:
            template = dataset[template_name]
            if not template.metadata.metrics:
                missing_metrics.append(f"{dataset_name}/{subset_name}/{template_name}")
            # original_task is tri-state: True / False / None (flag never set).
            if template.metadata.original_task is True:
                original_count += 1
            elif template.metadata.original_task is False:
                non_original_count += 1
            elif template.metadata.original_task is None:
                missing_og_flags.append(dataset_name + "/" + template_name)
                continue

        train_size = gsheet[ds_name]["train_size"]
        train_size = int(train_size) if train_size else 0  # blank cell -> 0
        # Even split of the training budget across this dataset's templates.
        adjusted_train_size = train_size // len(dataset.all_template_names)
        output.append(
            (
                f"{dataset_name} {subset_name if subset_name else ''}",
                f"{original_count}-{non_original_count}",
                f"{train_size:,} {adjusted_train_size:,}",
            )
        )

    pprint(output)
    print(len(template_collection))
    print("Missing metrics:")
    pprint(missing_metrics)
    print("Missing original task flags:")
    pprint(missing_og_flags)
# Allow running this module directly as a script.
if __name__ == "__main__":
    preview()