File size: 2,001 Bytes
7ad55c8
 
 
 
 
 
 
 
 
 
 
7ae3549
7ad55c8
 
 
 
 
 
7ae3549
 
 
 
 
 
 
 
 
 
 
7ad55c8
 
7ae3549
 
 
 
 
7ad55c8
 
7ae3549
 
7ad55c8
7ae3549
 
7ad55c8
7ae3549
7ad55c8
7ae3549
7ad55c8
 
7ae3549
 
 
7ad55c8
 
 
 
 
 
 
7ae3549
 
7ad55c8
 
7ae3549
7ad55c8
7ae3549
7ad55c8
 
 
7ae3549
7ad55c8
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

from config import MODEL_NAME, MAX_LENGTH, DATASET_EN_ES

# Load the pretrained tokenizer and seq2seq model; both are resolved from the
# same MODEL_NAME checkpoint (imported from config).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Load the dataset named by DATASET_EN_ES. No split is requested here; the
# code below indexes the result with "train".
dataset = load_dataset(DATASET_EN_ES)

# -----------------------------
# Preprocessing
# -----------------------------
def preprocess(example):
    """Tokenize one source/target pair and attach the target ids as labels.

    The source text is read from ``example["term"]["en"]`` and the target
    text from ``example["term"]["es"]``. Both are tokenized with truncation
    enabled and ``max_length=MAX_LENGTH``.

    Returns the tokenizer output for the source text with an extra
    ``"labels"`` entry holding the ``input_ids`` of the tokenized target.
    """
    pair = example["term"]
    english, spanish = pair["en"], pair["es"]

    # Same length handling for both sides of the pair.
    length_opts = {"max_length": MAX_LENGTH, "truncation": True}

    encoded = tokenizer(english, **length_opts)

    # The target goes in through `text_target` so the tokenizer treats it as
    # the target side of the pair rather than as another source input.
    target_encoding = tokenizer(text_target=spanish, **length_opts)
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

# Apply preprocessing. The original columns (taken from the "train" split) are
# removed so that only the fields returned by `preprocess` remain.
tokenized_dataset = dataset.map(preprocess, remove_columns=dataset["train"].column_names)

# -----------------------------
# Data collator
# -----------------------------
# `preprocess` does not pad, so batching/padding is left to the collator.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# -----------------------------
# Training arguments
# -----------------------------
# One location for both the per-epoch checkpoints and the final saved model.
OUTPUT_DIR = "./my-translation-model"

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=50,
    # Evaluation stays disabled. That is the library default, so the keyword
    # is omitted: it was spelled `evaluation_strategy` here, which newer
    # transformers releases renamed to `eval_strategy`.
    #
    # Mixed precision is enabled only when a CUDA device is present;
    # requesting fp16 unconditionally makes argument validation fail on a
    # CPU-only machine.
    # NOTE(review): this also turns fp16 off on non-CUDA accelerators -
    # confirm that is acceptable for the target hardware.
    fp16=torch.cuda.is_available(),
)

# -----------------------------
# Trainer
# -----------------------------
# No eval_dataset is supplied, consistent with evaluation being disabled.
# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=`; kept as-is to match the version this script was written for -
# confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train
trainer.train()

# Save the final model and tokenizer side by side so the directory can be
# loaded again with `from_pretrained`.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)