| from datasets import DatasetDict, load_from_disk |
| import argparse |
|
|
| from openai_dataset_maker import features |
|
|
def has_all_valid_labels(exp):
    """Return True iff every label in every annotation column is known.

    Columns named "text" or "tokens" carry raw input rather than labels
    and are skipped; all values in every other column must be members of
    the corresponding ``features[col]`` label set.
    """
    return all(
        label in features[col]
        for col, values in exp.items()
        if col not in {"text", "tokens"}
        for label in values
    )
|
|
def is_evenly_shaped(exp):
    """Return True iff all label columns have exactly one common length.

    The "text" column is ignored; every other column's value is expected
    to be a sequence.  Note this returns False for an example with no
    label columns at all (the length set is then empty).
    """
    lengths = {len(values) for col, values in exp.items() if col != "text"}
    return len(lengths) == 1
|
|
|
|
if __name__ == '__main__':
    # FIX: the old description claimed this script trains a model; it only
    # filters a saved dataset and splits it into train/validation/test.
    arg_parser = argparse.ArgumentParser(
        description="Filter a saved dataset and split it into "
                    "train/validation/test sets.")
    arg_parser.add_argument("data_path", help="Load dataset from specified path.",
                            action="store")
    arg_parser.add_argument("--save-path", help="Save final dataset to specified path.",
                            action="store", default="./training_data")
    args = arg_parser.parse_args()

    # Drop examples whose label lists have mismatched lengths, then drop
    # examples containing labels unknown to the feature sets.
    loaded_dataset = load_from_disk(args.data_path)
    loaded_dataset = loaded_dataset.filter(is_evenly_shaped)
    loaded_dataset = loaded_dataset.filter(has_all_valid_labels)

    # First split carves off the test set; the second carves validation out
    # of the remaining training data.  FIX: the second split previously had
    # no seed, so the train/validation partition was non-reproducible while
    # the test split was seeded — both now use seed=42.
    first_split = loaded_dataset.train_test_split(shuffle=True, seed=42, test_size=0.09)
    second_split = first_split["train"].train_test_split(shuffle=True, seed=42,
                                                         test_size=0.1)

    new_ds = DatasetDict()
    new_ds["test"] = first_split["test"]
    new_ds["train"] = second_split["train"]
    new_ds["validation"] = second_split["test"]
    new_ds.save_to_disk(args.save_path)
|
|