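# Evaluation script: load a fine-tuned multi-label sequence-classification
# checkpoint, run Trainer.evaluate() on one dataset's test split, and dump the
# raw logits/labels to pickle files for offline scoring.
# Invoked with a dataset index and a dataset size, e.g. (the script filename
# here is illustrative):  python evaluate.py 2 1000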
import argparse
import pickle

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Datasets are selected by positional index on the command line; the names
# double as path suffixes and pickle-filename tags below.
DS_NAMES = ["yle", "online_review", "xed", "ylilauta"]

parser = argparse.ArgumentParser()
parser.add_argument('test', type=int, help="index into DS_NAMES selecting the dataset")
parser.add_argument('length', type=int, help="dataset size; printed for logging only")

args = parser.parse_args()

def compute_metrics(eval_pred):
    # No metrics are computed in-process: the raw predictions are pickled and
    # scored offline instead.
    logits, labels = eval_pred
    ds_name = DS_NAMES[args.test]
    with open("logits_{}.pickle".format(ds_name), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format(ds_name), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Trainer expects a mapping of metric names to values, so return an
    # empty dict rather than None.
    return {}
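
# The pickles written above can be scored after the run. A minimal offline
# sketch (not part of this script; assumes multi-label logits, a 0.5 sigmoid
# cutoff, and micro-averaged F1 as the metric of interest):
#
#   import pickle
#   import numpy as np
#   with open("logits_xed.pickle", "rb") as f:
#       logits = np.asarray(pickle.load(f))
#   with open("labels_xed.pickle", "rb") as f:
#       labels = np.asarray(pickle.load(f))
#   preds = 1.0 / (1.0 + np.exp(-logits)) > 0.5
#   gold = labels > 0.5
#   micro_f1 = 2 * (preds & gold).sum() / (preds.sum() + gold.sum())
#   print("micro-F1:", micro_f1)
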
class MultilabelTrainer(Trainer):
    """Trainer whose loss treats each label as an independent binary target."""

    # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that newer
    # Trainer versions pass to compute_loss.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

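# BCEWithLogitsLoss applies a sigmoid per output unit, so every label is an
# independent yes/no decision, which is what multi-label classification
# needs. Tiny illustration with made-up numbers:
#
#   loss_fct = torch.nn.BCEWithLogitsLoss()
#   logits = torch.tensor([[2.0, -1.0]])   # confident "on" for label 0
#   targets = torch.tensor([[1.0, 0.0]])   # gold agrees on both labels
#   print(loss_fct(logits, targets))       # -> ~0.22, a small loss
#
# Recent transformers releases can achieve the same effect without a custom
# Trainer by setting config.problem_type = "multi_label_classification".
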
def main():
    print("test:", args.test)
    ds_name = DS_NAMES[args.test]

    ds_size = args.length
    print(ds_name, ds_size)

    metric = compute_metrics

    output_dir = "/data/loc/" + ds_name

    # Only evaluate() is called below, so the optimizer, scheduler, and
    # checkpointing settings here are effectively inert. Note that max_steps
    # takes precedence over num_train_epochs when both are set, and the
    # half-precision options do nothing unless fp16=True is also passed
    # (fp16_opt_level only applies to the apex backend anyway).
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True,
    )

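    # For a pure evaluation run, a much smaller argument set would behave the
    # same way; illustrative minimal equivalent:
    #
    #   training_args = TrainingArguments(
    #       output_dir=output_dir,
    #       per_device_eval_batch_size=4,
    #       dataloader_num_workers=5,
    #       disable_tqdm=True,
    #   )
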
    print(training_args)

    # Evaluation-only run: the held-out "test" split serves as both train and
    # eval dataset in the Trainer constructor below.
    dataset = load_from_disk(r"/data_loc/" + ds_name)["test"]

    trainer_class = MultilabelTrainer

    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    # GPT-style tokenizers ship without a pad token; reusing EOS lets batches
    # be padded without adding a new embedding row.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

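    # default_data_collator performs no tokenization or padding, so the saved
    # dataset is assumed to already contain fixed-length input_ids /
    # attention_mask / labels columns. A hypothetical preprocessing step that
    # would produce such a dataset (column name "text" and max_length=512 are
    # assumptions, not taken from this script):
    #
    #   def tokenize(batch):
    #       return tokenizer(batch["text"], truncation=True,
    #                        padding="max_length", max_length=512)
    #   dataset = dataset.map(tokenize, batched=True)
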
| | print("init trainer") |
| | trainer = trainer_class( |
| | model=model, |
| | args=training_args, |
| | train_dataset=dataset, |
| | eval_dataset=dataset, |
| | tokenizer=tokenizer, |
| | compute_metrics=metric, |
| | data_collator=default_data_collator |
| | ) |
| | |
| | |
| | |
| | |
| | |
    # No training: run a single evaluation pass. The returned dict holds
    # eval_loss and runtime statistics, plus whatever compute_metrics returns.
    metrics = trainer.evaluate()
    print(metrics)

if __name__ == "__main__":
    main()