import json
from os import path
from typing import Dict, Optional

import jsonlines


def get_data_file(data_dir: str, split: str, max_segment_len: int) -> Optional[str]:
    """Return the path of the jsonlines file for the given split, or None."""
    # Prefer the file keyed by the maximum segment length
    jsonl_file = path.join(
        data_dir, "{}.{}.met.jsonlines".format(split, max_segment_len)
    )
    print("File access: ", jsonl_file)
    if path.exists(jsonl_file):
        return jsonl_file

    # Fall back to the file without the segment-length infix
    jsonl_file = path.join(data_dir, "{}.met.jsonlines".format(split))
    if path.exists(jsonl_file):
        return jsonl_file

    # Neither file exists; callers are expected to handle None
    return None
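
# Resolution-order sketch (illustrative paths, not part of the module): for
# data_dir="/data/onto", split="dev", max_segment_len=4096 the lookup tries
#   1. /data/onto/dev.4096.met.jsonlines  (segment-length-specific file)
#   2. /data/onto/dev.met.jsonlines       (generic fallback)
# and returns None when neither exists.
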
def load_dataset(
    data_dir: str,
    singleton_file: Optional[str] = None,
    max_segment_len: int = 2048,
    num_train_docs: Optional[int] = None,
    num_dev_docs: Optional[int] = None,
    num_test_docs: Optional[int] = None,
    dataset_name: Optional[str] = None,
) -> Dict:
    """Load the train/dev/test splits, optionally adding singleton mentions."""
    all_splits = []
    for split in ["train", "dev", "test"]:
        jsonl_file = get_data_file(data_dir, split, max_segment_len)
        if jsonl_file is None:
            raise ValueError(f"No relevant files at {data_dir}")

        # Each line of the jsonlines file encodes one document
        split_data = []
        with open(jsonl_file) as f:
            for line in f:
                load_dict = json.loads(line.strip())
                load_dict["dataset_name"] = dataset_name
                split_data.append(load_dict)
        all_splits.append(split_data)

    train_data, dev_data, test_data = all_splits

    if singleton_file is not None and path.exists(singleton_file):
        num_singletons = 0
        with open(singleton_file) as f:
            # Maps doc_key -> list of singleton clusters for that document
            singleton_data = json.loads(f.read())

        for instance in train_data:
            doc_key = instance["doc_key"]
            if doc_key in singleton_data:
                # Append the first mention of each singleton cluster to the
                # document's last cluster; documents with no clusters at all
                # are left untouched
                if len(instance["clusters"]) != 0:
                    num_singletons += len(singleton_data[doc_key])
                    instance["clusters"][-1].extend(
                        [cluster[0] for cluster in singleton_data[doc_key]]
                    )
        print("Added %d singletons" % num_singletons)

    return {
        # Slicing with a None bound keeps the whole split
        "train": train_data[:num_train_docs],
        "dev": dev_data[:num_dev_docs],
        "test": test_data[:num_test_docs],
    }
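
# Expected singleton_file layout, as inferred from the loop above (the file
# format is not documented in the original): one JSON object mapping doc_key
# to a list of singleton clusters, e.g.
#   {"doc_0": [[[12, 13]], [[40, 42]]]}
# Only the first mention of each singleton cluster (cluster[0]) is used.
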
def load_eval_dataset(
    data_dir: str,
    external_md_file: Optional[str],
    max_segment_len: int,
    dataset_name: Optional[str] = None,
) -> Dict:
    """Load the dev/test splits, optionally attaching external mention predictions."""
    data_dict = {}
    for split in ["dev", "test"]:
        jsonl_file = get_data_file(data_dir, split, max_segment_len)
        if jsonl_file is not None:
            split_data = []
            with open(jsonl_file) as f:
                for line in f:
                    load_dict = json.loads(line.strip())
                    load_dict["dataset_name"] = dataset_name
                    split_data.append(load_dict)
            data_dict[split] = split_data

    if external_md_file is not None and path.exists(external_md_file):
        # Index the external mention detector's output by document key
        predicted_mentions = {}
        with jsonlines.open(external_md_file, mode="r") as reader:
            for line in reader:
                predicted_mentions[line["doc_key"]] = line

        for split in ["dev", "test"]:
            # Guard with .get(): a split is absent if its file was missing
            for instance in data_dict.get(split, []):
                doc_key = instance["doc_key"]
                if doc_key in predicted_mentions:
                    instance["ext_predicted_mentions"] = sorted(
                        predicted_mentions[doc_key]["pred_mentions"]
                    )

    return data_dict
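

if __name__ == "__main__":
    # Smoke-test sketch (added for illustration; the tiny document below is
    # made up): build a throwaway data directory with one document per split
    # and round-trip it through both loaders.
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        doc = {"doc_key": "doc_0", "clusters": [[[0, 1], [5, 6]]]}
        for split in ["train", "dev", "test"]:
            # Use the fallback name that get_data_file checks second
            with open(path.join(tmp_dir, "{}.met.jsonlines".format(split)), "w") as f:
                f.write(json.dumps(doc) + "\n")

        data = load_dataset(tmp_dir, dataset_name="toy")
        print({split: len(docs) for split, docs in data.items()})

        eval_data = load_eval_dataset(
            tmp_dir, external_md_file=None, max_segment_len=2048, dataset_name="toy"
        )
        print(sorted(eval_data.keys()))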