Spaces:
Runtime error
Runtime error
| from datasets import load_dataset | |
# Global configuration.
# Directory prefix for everything this script writes to disk. The trailing
# slash is required: the save step builds its target path by plain string
# concatenation (file_path + 'dataset_dict').
file_path = '/app/models/content/'
"""## Loading the Dataset
We will be finetuning the DistilBERT model on a subset of patents filed in January 2016. We perform the train-test split as
follows: any patents filed on or before January 21st, 2016 will be part of the training set, and patents filed on January 22nd,
2016 or after will be part of the validation set.
"""
# Load the 'sample' configuration of HUPD, split by filing date: the
# train_filing_* arguments bound the training split (2016-01-01 to
# 2016-01-21) and the val_filing_* arguments bound the validation split
# (2016-01-22 to 2016-01-31).
# NOTE(review): HUPD/hupd appears to be served through a dataset loading
# script; recent `datasets` releases may require `trust_remote_code=True`
# for such datasets -- confirm against the installed version before adding it.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

# Sanity check: show the loaded splits and their (rows, columns) shapes.
print(dataset_dict)
print(f'Train dataset size: {dataset_dict["train"].shape}')
print(f'Validation dataset size: {dataset_dict["validation"].shape}')
"""## Pre-Processing Steps
Our model will only be able to predict rejections or acceptances. We will have to filter out any
other decisions from our training and validation sets in order to proceed.
"""
# Label-to-index mapping for the decision status field.
# NOTE: despite its name, this dict maps decision *strings* to integer ids.
# The name is kept because other parts of the script refer to it.
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}


def map_decision_to_string(example: dict) -> dict:
    """Return the integer id for an example's decision label.

    Despite the function's name, the result holds an integer: the value of
    ``decision_to_str`` for the example's ``'decision'`` string.

    Args:
        example: A mapping with a ``'decision'`` key whose value is one of
            the keys of ``decision_to_str``.

    Returns:
        A one-entry dict ``{'decision': <int id>}``. Only this field is
        returned; other keys of ``example`` are not copied.

    Raises:
        KeyError: If ``example`` has no ``'decision'`` key or its value is
            not a known decision label.
    """
    return {'decision': decision_to_str[example['decision']]}
# Re-label each split's decisions as integer ids, then keep only the two
# classes the model predicts: 'REJECTED' (0) and 'ACCEPTED' (1).
for name in ['train', 'validation']:
    dataset_dict[name] = dataset_dict[name].map(map_decision_to_string)
    # Ids above 1 are the pending and CONT-* applications; drop them.
    dataset_dict[name] = dataset_dict[name].filter(lambda e: e['decision'] <= 1)

# Save the dataset dictionary to disk once, after both splits are processed.
# file_path already ends with '/', so plain concatenation yields a valid path.
dataset_dict.save_to_disk(file_path + 'dataset_dict')