# NOTE: only the datasets, AutoTokenizer and numpy imports are used by the
# preprocessing below.
import os
import sys
import argparse

import numpy as np
import torch

import transformers
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    GPT2Model,
    GPT2Tokenizer,
    GPT2TokenizerFast,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint

# Tokenizer from the local checkpoint; reuse the EOS token as padding, since
# GPT-2-style tokenizers do not ship with a dedicated pad token.
tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

out_dir = "/out_dir/xed"   # where the processed dataset is written
max_length = 1024          # fixed sequence length for padding/truncation

# XED (Finnish): "fi_annotated" holds sentences with lists of emotion labels,
# "fi_neutral" holds neutral sentences with a single scalar label.
fi_annotated_raw = load_dataset("xed_en_fi", "fi_annotated")
fi_neutral_raw = load_dataset("xed_en_fi", "fi_neutral")

def to_arr(examples):
    """Wrap each scalar neutral label in a one-element list so the column
    matches the sequence-of-labels schema of the annotated split."""
    labels = [[item] for item in examples["labels"]]
    return {"sentence": examples["sentence"], "labels": labels}


fi_neutral_mapped = fi_neutral_raw["train"].map(to_arr, batched=True)

# Cast the neutral split to the annotated split's features and merge the two.
fi_neutral_mapped_cast = fi_neutral_mapped.cast(fi_annotated_raw["train"].features)
concat_raw_set = concatenate_datasets([fi_neutral_mapped_cast, fi_annotated_raw["train"]])

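# Optional sanity check: the merged set should expose the annotated split's
# features, and (given the concatenation order) row 0 comes from the neutral
# split and should now carry a one-element label list.
print(concat_raw_set)
print(concat_raw_set.features)
print(concat_raw_set[0])
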
def tokenize_function(examples):
    """Tokenize sentences to a fixed length with padding and truncation."""
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=max_length)

def to_arr_2(examples):
    """Turn each list of label indices into a 9-dimensional multi-hot vector
    suitable for multi-label classification."""
    labels = []
    for item in examples["labels"]:
        label = np.zeros(9)
        label[item] = 1  # set every annotated class index to 1
        labels.append(label.tolist())
    return {"sentence": examples["sentence"], "labels": labels}

# Tokenize, convert labels to multi-hot vectors, shuffle, carve out a 10% test
# split, and write the result to disk.
tokenized_datasets = (
    concat_raw_set.map(tokenize_function, batched=True)
    .map(to_arr_2, batched=True)
    .shuffle(seed=42)
    .train_test_split(test_size=0.1)
)
tokenized_datasets.save_to_disk(out_dir)
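
# ---------------------------------------------------------------------------
# Optional follow-up sketch (commented out, not part of the preprocessing):
# one way the saved dataset could be fed to the Trainer API for multi-label
# emotion classification. Reusing "/checkpoint/loc" as a sequence
# classification backbone, the batch size, and the epoch count are
# assumptions for illustration only.
#
# from datasets import load_from_disk
#
# reloaded = load_from_disk(out_dir)
# model = AutoModelForSequenceClassification.from_pretrained(
#     "/checkpoint/loc",
#     num_labels=9,
#     problem_type="multi_label_classification",
# )
# # GPT-2-style models need the pad token id set explicitly for batched
# # classification.
# model.config.pad_token_id = tokenizer.pad_token_id
#
# training_args = TrainingArguments(
#     output_dir=os.path.join(out_dir, "clf"),
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
# )
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=reloaded["train"],
#     eval_dataset=reloaded["test"],
#     data_collator=default_data_collator,
# )
# trainer.train()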