| from datasets import DatasetDict, load_from_disk | |
| import argparse | |
| from openai_dataset_maker import features | |
def has_all_valid_labels(exp):
    """Return True when every label in the example is known to ``features``.

    Args:
        exp: Mapping of column name to that column's value for one example.
            The ``"text"`` and ``"tokens"`` columns are skipped; every other
            column is iterated as a collection of labels.

    Returns:
        False as soon as one label fails the ``in features[col]`` membership
        test for its column, True otherwise (including when there is nothing
        to check).

    Raises:
        KeyError: presumably, if a non-empty label column has no entry in
            ``features`` -- depends on the type of ``features``; verify
            against ``openai_dataset_maker``.
    """
    # Columns that carry raw input rather than labels.
    unlabeled_columns = {"text", "tokens"}
    # ``features[column]`` is only looked up when a column actually yields a
    # label, so empty label columns never touch ``features``.
    return all(
        tag in features[column]
        for column, tags in exp.items()
        if column not in unlabeled_columns
        for tag in tags
    )
def is_evenly_shaped(exp):
    """Return True when all non-``"text"`` columns have the same length.

    Args:
        exp: Mapping of column name to that column's value for one example.
            Every column except ``"text"`` must support ``len()``.

    Returns:
        True only if exactly one distinct length is observed across the
        non-``"text"`` columns. An example with no such columns yields False,
        because no length is observed at all.
    """
    distinct_lengths = {
        len(values) for column, values in exp.items() if column != "text"
    }
    return len(distinct_lengths) == 1
def main():
    """Filter a saved dataset and write train/validation/test splits to disk.

    Command-line arguments:
        data_path: Path handed to ``load_from_disk``.
        --save-path: Destination for the resulting ``DatasetDict``
            (default ``./training_data``).

    Examples that fail ``is_evenly_shaped`` or ``has_all_valid_labels`` are
    dropped before splitting. Both splits use the same fixed seed so that the
    train/validation/test partition is reproducible across runs.
    """
    # One seed for every split; previously only the first split was seeded,
    # so train/validation membership changed from run to run.
    split_seed = 42

    arg_parser = argparse.ArgumentParser(
        description="Filter a dataset and split it into train/validation/test sets."
    )
    arg_parser.add_argument(
        "data_path",
        help="Load dataset from specified path.",
        action="store",
    )
    arg_parser.add_argument(
        "--save-path",
        help="Save final dataset to specified path.",
        action="store",
        default="./training_data",
    )
    args = arg_parser.parse_args()

    # NOTE(review): train_test_split is called directly on the loaded object,
    # so this assumes data_path holds a single Dataset, not a DatasetDict.
    loaded_dataset = load_from_disk(args.data_path)
    loaded_dataset = loaded_dataset.filter(is_evenly_shaped)
    loaded_dataset = loaded_dataset.filter(has_all_valid_labels)

    # Hold out 9% of the filtered data as the test set.
    first_split = loaded_dataset.train_test_split(
        shuffle=True, seed=split_seed, test_size=0.09
    )
    # Take 10% of the remaining 91% (about 9.1% of the filtered data) as the
    # validation set; the rest is the training set.
    second_split = first_split["train"].train_test_split(
        shuffle=True, seed=split_seed, test_size=0.1
    )

    new_ds = DatasetDict()
    new_ds["test"] = first_split["test"]
    new_ds["train"] = second_split["train"]
    new_ds["validation"] = second_split["test"]
    new_ds.save_to_disk(args.save_path)


if __name__ == "__main__":
    main()