| | from transformers import Trainer, TrainingArguments |
| | from datasets import Dataset |
| | import json |
| | from modeling_tinytransformer import TinyTransformerModel |
| | from configuration_tinytransformer import TinyTransformerConfig |
| | from tokenization_tinytransformer import TinyTokenizer |
| |
|
| | |
# Load the training records from a JSON Lines file: each non-empty line is
# parsed as one JSON document and becomes one row of the dataset.
with open("data/train_data.jsonl", "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines (e.g. a stray empty line at the end
    # of the file); json.loads would raise JSONDecodeError on them.
    data = [json.loads(line) for line in f if line.strip()]

dataset = Dataset.from_list(data)
| |
|
| | |
# Tokenizer instance used by preprocess() below.
tokenizer = TinyTokenizer()


def preprocess(examples):
    """Tokenize the ``"text"`` column and attach the ``"label"`` column as labels.

    Used with ``dataset.map(..., batched=True)``, so ``examples`` is indexed by
    column name. The tokenizer is asked to truncate and to pad to
    ``max_length=64``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding
        ``examples["label"]``.
    """
    batch = tokenizer(
        examples["text"],
        truncation=True,
        max_length=64,
        padding="max_length",
    )
    batch["labels"] = examples["label"]
    return batch


# Apply the preprocessing to the whole dataset, one batch at a time.
tokenized_dataset = dataset.map(preprocess, batched=True)
| |
|
| | |
# Build the model from a fresh config: the vocabulary size is taken from the
# tokenizer and the config is set up for two labels.
config = TinyTransformerConfig(
    vocab_size=tokenizer.vocab_size,
    num_labels=2,
)
model = TinyTransformerModel(config)
| |
|
| | |
# Training configuration for the run.
training_args = TrainingArguments(
    # Schedule.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Output / bookkeeping: log at every step, do not save checkpoints during
    # training, and do not report to any external tracking integration.
    output_dir="./results",
    logging_steps=1,
    save_strategy="no",
    report_to="none",
)

# Only a training set is supplied; no evaluation dataset is configured.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
)

trainer.train()
| |
|
| | |
| |
|
# Save the model, then the tokenizer, then the config into the same directory.
for _artifact in (model, tokenizer, config):
    _artifact.save_pretrained("./tiny-sentiment-model")