from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
Seq2SeqTrainingArguments
)
from config import MODEL_NAME, MAX_LENGTH, DATASET_EN_ES
# Tokenizer and seq2seq model are both restored from the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

# Dataset identified by DATASET_EN_ES; later steps index its "train" split.
dataset = load_dataset(path=DATASET_EN_ES)
# -----------------------------
# Preprocessing
# -----------------------------
def preprocess(example):
    """Tokenize one source/target pair into model inputs with labels.

    Reads the source text from ``example["term"]["en"]`` and the target text
    from ``example["term"]["es"]``. Both sides are tokenized with truncation
    at ``MAX_LENGTH``; no padding argument is passed here.

    Returns the tokenizer's encoding of the source text, with the target's
    token ids stored under the ``"labels"`` key.
    """
    pair = example["term"]
    # Supplying text_target in the same call makes the tokenizer encode the
    # target as well and attach its input ids as "labels".
    return tokenizer(
        pair["en"],
        text_target=pair["es"],
        max_length=MAX_LENGTH,
        truncation=True,
    )
# Run preprocess over the dataset. The original columns (taken from the
# "train" split) are removed so only the tokenizer outputs are kept.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(preprocess, remove_columns=raw_columns)
# -----------------------------
# Data collator
# -----------------------------
# preprocess() does not pad, so batching is delegated to the seq2seq collator.
# It receives both the tokenizer and the model being trained.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# -----------------------------
# Training arguments
# -----------------------------
import torch  # imported here because it is only needed to probe for CUDA

training_args = Seq2SeqTrainingArguments(
    output_dir="./my-translation-model",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    logging_steps=50,
    # Evaluation stays disabled by leaving the strategy at its default ("no").
    # The explicit `evaluation_strategy` keyword is omitted on purpose: newer
    # transformers releases replace it with `eval_strategy`.
    #
    # Mixed precision is only requested when a CUDA device is present, so the
    # script still starts on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)
# -----------------------------
# Trainer
# -----------------------------
# Only the "train" split is supplied; no evaluation dataset is passed.
# NOTE(review): newer transformers releases appear to rename `tokenizer=` to
# `processing_class=` -- confirm against the installed version before changing.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    args=training_args,
)
# Run the training loop configured above.
trainer.train()

# Save the trained weights and the tokenizer files side by side in the same
# directory.
save_dir = "./my-translation-model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)