Ryan Kim
adding these files as a backup of an older project that got mangled by Git LFS's size limit
6410115
| from datasets import load_dataset | |
| import pandas as pd | |
| import numpy as np | |
| import os | |
| import json | |
| import torch | |
| import sys | |
| from torch.utils.data import Dataset, DataLoader | |
| from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification | |
| from transformers import Trainer, TrainingArguments, AdamW | |
# Base checkpoint that the classifiers in this file are loaded from.
model_name = "distilbert-base-uncased"

# Directories the fine-tuned abstracts / claims models are saved to.
upsto_abstracts_model_path = './models/uspto_abstracts'
upsto_claims_model_path = './models/uspto_claims'

# Permit TF32 in CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
class USPTODataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    `encodings` is a mapping from field name to an indexable sequence of
    per-example values; `labels` is an indexable sequence with one entry
    per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
def _prepare_split(split_df):
    """Reduce one raw split to parallel lists of labels, abstracts and claims."""
    label_by_decision = {"ACCEPTED": 1, "REJECTED": 0}

    # Select the columns we need BEFORE dropping missing values, so a row is
    # not discarded just because an unrelated metadata column is null.
    frame = split_df[["abstract", "claims", "decision"]].dropna()

    # Keep only decided applications; any other decision value (e.g. pending)
    # has no binary label.
    frame = frame[frame["decision"].isin(list(label_by_decision))]

    return {
        "labels": frame["decision"].map(label_by_decision).astype(int).tolist(),
        "abstracts": frame["abstract"].tolist(),
        "claims": frame["claims"].tolist(),
    }


def LoadDataset(dataset_dict=None, output_dir="./data"):
    """Load the HUPD sample, pre-process it, and cache the result as JSON.

    Args:
        dataset_dict: Optional mapping with 'train' and 'validation' entries,
            each convertible with `pd.DataFrame(...)`. When omitted, the HUPD
            'sample' configuration is loaded with filing dates 2016-01-01 to
            2016-01-21 for training and 2016-01-22 to 2016-01-31 for
            validation.
        output_dir: Directory that receives `train.json` and `val.json`.
            Created if it does not exist.

    Returns:
        A `(trainData, valData)` tuple. Each element is a dict with the keys
        "labels" (1 for ACCEPTED, 0 for REJECTED), "abstracts" and "claims",
        holding equally long lists.
    """
    if dataset_dict is None:
        print("=== LOADING THE DATASET ===")
        # Extracting the dataset, filtering only for Jan. 2016
        dataset_dict = load_dataset(
            'HUPD/hupd',
            name='sample',
            data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
            icpr_label=None,
            train_filing_start_date='2016-01-01',
            train_filing_end_date='2016-01-21',
            val_filing_start_date='2016-01-22',
            val_filing_end_date='2016-01-31',
        )

    print("Separating between training and validation data")
    df_train = pd.DataFrame(dataset_dict['train'])
    df_val = pd.DataFrame(dataset_dict['validation'])

    print("=== PRE-PROCESSING THE DATASET ===")
    print("Extracting label and text data from dataframes")
    trainData = _prepare_split(df_train)
    valData = _prepare_split(df_val)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    for file_name, payload in (("train.json", trainData), ("val.json", valData)):
        with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as outfile:
            json.dump(payload, outfile, indent=2)

    return trainData, valData
def TrainModel(trainData, valData):
    """Fine-tune one DistilBERT classifier per text field and save each.

    A separate model is trained on the abstracts and on the claims, and
    written to `upsto_abstracts_model_path` and `upsto_claims_model_path`
    respectively.

    Args:
        trainData: dict with the keys "labels", "abstracts" and "claims",
            each an equally long list.
        valData: accepted for interface compatibility; no evaluation is run,
            so it is currently unused.

    Raises:
        ValueError: if `trainData` contains no examples.
    """
    if not trainData["labels"]:
        raise ValueError("trainData contains no examples to train on")

    print("=== ENCODING DATA ===")
    print("\t- initializing tokenizer")
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

    print(trainData["abstracts"][:10])
    print(trainData["labels"][:10])

    print("=== PREPARING MODEL ===")
    print("\t- setting up device")
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def Train(loader, save_path, num_train_epochs=2):
        # Every run gets its own model and optimizer. Sharing them would make
        # the second run continue from the first run's weights and optimizer
        # state instead of starting from the base checkpoint.
        print("\t- initializing model")
        model = DistilBertForSequenceClassification.from_pretrained(model_name)
        model.to(device)
        print("\t- initializing optim")
        optim = AdamW(model.parameters(), lr=5e-5)

        # Explicitly enter training mode so dropout is active for this run.
        model.train()

        batch_num = len(loader)
        for epoch in range(num_train_epochs):
            print(f'\t- Training epoch {epoch+1}/{num_train_epochs}')
            for batch_count, batch in enumerate(loader):
                print(f'{batch_count}|{batch_num} - {round((batch_count/batch_num)*100)}%', end="")
                optim.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                # With `labels` supplied, the first output element is the loss.
                loss = outputs[0]
                loss.backward()
                optim.step()
                print("\r", end="")

        model.eval()
        model.save_pretrained(save_path)
        print(f'Saved model in {save_path}!')

    runs = (
        ("abstracts", "ABSTRACTS", upsto_abstracts_model_path),
        ("claims", "CLAIMS", upsto_claims_model_path),
    )
    for field, title, save_path in runs:
        print(f"=== TRAINING {title} ===")
        # Encode each field right before its run so only one set of padded
        # encodings is held in memory at a time.
        print("\t- encoding training data")
        encodings = tokenizer(trainData[field], truncation=True, padding=True)
        print("\t- initializing dataset for training data")
        dataset = USPTODataset(encodings, trainData["labels"])
        loader = DataLoader(dataset, batch_size=4, shuffle=True)
        Train(loader, save_path)
def main():
    """Load the cached train/val data (building it if absent), then train.

    If both `./data/train.json` and `./data/val.json` exist they are read
    back; otherwise `LoadDataset()` is called to produce the data. The result
    is passed to `TrainModel`.
    """
    trainDataPath = "./data/train.json"
    valDataPath = "./data/val.json"

    if os.path.exists(trainDataPath) and os.path.exists(valDataPath):
        print("Loading from existing data files")
        # Context managers close the files even if JSON parsing fails.
        with open(trainDataPath, encoding="utf-8") as ftrain:
            trainData = json.load(ftrain)
        with open(valDataPath, encoding="utf-8") as fval:
            valData = json.load(fval)
    else:
        trainData, valData = LoadDataset()

    print("Data loaded successfully!")
    TrainModel(trainData, valData)
# Archived alternative: the same fine-tuning expressed with the Hugging Face
# `Trainer` API. The original kept this snippet in an unused string literal,
# so it never ran; `train_dataset` and `val_dataset` are not defined in this
# file.
#
# train_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=2,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     warmup_steps=500,
#     learning_rate=5e-5,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10
# )
# model = DistilBertForSequenceClassification.from_pretrained(model_name)
# trainer = Trainer(
#     model=model,
#     args=train_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset
# )
# trainer.train()

if __name__ == "__main__":
    main()