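"""Evaluate a fine-tuned sequence-classification checkpoint on one dataset's test split.

The dataset is picked by index via the positional "test" argument; raw logits and
labels from the evaluation pass are pickled to disk by compute_metrics.
"""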
import argparse
import pickle

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Candidate datasets; the positional "test" argument picks one by index.
# Defined at module level so compute_metrics below can also see the choice.
ds_names = ["yle", "online_review", "xed", "ylilauta"]

parser = argparse.ArgumentParser()
parser.add_argument("test", type=int, help="index into ds_names selecting the dataset to evaluate")
parser.add_argument("length", type=int, help="dataset size; printed for logging only")

args = parser.parse_args()
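# Example invocation (the script filename here is illustrative):
#   python evaluate.py 2 1000    # index 2 picks "xed" from ds_names
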
def compute_metrics(eval_pred):
    """Pickle raw logits and labels for offline analysis; return an empty metrics dict."""
    logits, labels = eval_pred
    ds_name = ds_names[args.test]  # was hard-coded to "xed" regardless of the selected dataset
    with open("logits_{}.pickle".format(ds_name), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format(ds_name), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Trainer expects a dict of metrics back; returning None would make
    # evaluate() fail when it merges these with its own entries.
    return {}
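

# A minimal sketch of a real metric function that could be passed to the Trainer
# instead of compute_metrics above. It assumes scikit-learn is installed and a
# multi-label setup (sigmoid activation, 0.5 decision threshold); it is an
# illustration, not part of the original script.
def compute_multilabel_f1(eval_pred):
    from sklearn.metrics import f1_score  # assumed extra dependency

    logits, labels = eval_pred
    preds = (torch.sigmoid(torch.from_numpy(logits)) > 0.5).int().numpy()
    return {"micro_f1": f1_score(labels, preds, average="micro")}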


class MultilabelTrainer(Trainer):
    """Trainer variant whose loss treats each label as an independent binary target."""

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.float().view(-1, self.model.config.num_labels),
        )
        return (loss, outputs) if return_outputs else loss
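
# Note: BCEWithLogitsLoss scores every label independently, so each example's
# "labels" feature must be a multi-hot float vector of length num_labels,
# e.g. [1.0, 0.0, 1.0, 0.0] marks classes 0 and 2 as active (illustrative values).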


def main():
    print("test:", args.test)
    ds_name = ds_names[args.test]

    ds_size = args.length  # reported for logging; the full test split is evaluated below
    print(ds_name, ds_size)

    metric = compute_metrics

    output_dir = "/data/loc/" + ds_name

    # Most of these arguments only matter for training; evaluate() below mainly
    # uses the eval batch size, dataloader workers, and logging settings.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,  # takes precedence over num_train_epochs when both are set
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",  # only consulted by the apex backend, and only if fp16 is enabled
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True,
    )

    print(training_args)

    # Evaluate on the test split of the preprocessed dataset saved on disk.
    dataset = load_from_disk(r"/data_loc/" + ds_name)["test"]

    trainer_class = MultilabelTrainer

    # Load the fine-tuned checkpoint and its tokenizer for the selected dataset.
    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    # GPT-2-style tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
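    # default_data_collator just batches already-tokenized features, so the
    # dataset on disk is assumed to carry "input_ids", "attention_mask", and
    # multi-hot "labels" columns (inferred from the collator choice; this
    # script does not verify it).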
| print("init trainer") |
| trainer = trainer_class( |
| model=model, |
| args=training_args, |
| train_dataset=dataset, |
| eval_dataset=dataset, |
| tokenizer=tokenizer, |
| compute_metrics=metric, |
| data_collator=default_data_collator |
| ) |
| |
| |
| |
| |
| |
    # Single evaluation pass; compute_metrics also pickles the raw outputs.
    metrics = trainer.evaluate()
    print(metrics)
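    # evaluate() returns a dict such as {"eval_loss": ..., "eval_runtime": ...};
    # Trainer's helpers can persist it if desired, e.g.:
    #   trainer.log_metrics("eval", metrics)
    #   trainer.save_metrics("eval", metrics)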


if __name__ == "__main__":
    main()