from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')

# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base, but that model is bigger, so training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    # tokenize and truncate text; outputs are left as lists so that ragged batches
    # don't break numpy conversion — the data collator pads and tensorizes later
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=512
    )

    return tokenized_inputs
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute already returns {"accuracy": ...}, so return it directly
    # rather than nesting it inside another dict
    return accuracy.compute(predictions=predictions, references=labels)
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=['q_lin'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)
# train model
trainer.train()
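# a minimal sketch (not part of the original listing): you can also run a standalone
# evaluation pass over the validation split after training; trainer.evaluate() returns a
# dict of metrics such as eval_loss and eval_accuracy (the latter from compute_metrics)
val_metrics = trainer.evaluate()
print(val_metrics)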
model.to('mps')  # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps")  # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login()  # ensure token gives write access

hf_name = 'laxmisahu'  # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification"  # you can name the model whatever you want
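# an assumed intermediate step (not shown above): push the trained LoRA adapter to the Hub
# under model_id so the "load from hub" code below has a repo to download from;
# push_to_hub is the standard peft/transformers API for this
model.push_to_hub(model_id)      # uploads the adapter weights and adapter config
tokenizer.push_to_hub(model_id)  # optional: upload the tokenizer files alongside the adapter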
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)
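# a short usage sketch (added here, not in the original listing): run the reloaded PEFT
# model over the same example sentences to confirm the downloaded adapter behaves like
# the one trained above
model.eval()
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs).logits
    prediction = torch.argmax(logits, dim=1).item()
    print(text + " - " + id2label[prediction])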