# File size: 4,812 Bytes
# 1694d2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""

ViSum - QLoRA fine-tuning of BARTpho on 144K VietNews samples (5 epochs)

GPU: RTX 3090 24GB

Author: OrdinaryAI

"""

from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    DataCollatorForSeq2Seq, BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch
import os

# ============================================
# 1. LOAD DATASET ("144K samples")
# ============================================
# Step banner: the title framed by two 60-character rules.
print("=" * 60, "BƯỚC 1: Load dataset VietNews (144K mẫu)", "=" * 60, sep="\n")

# Load the dataset identified by "harouzie/vietnews" and keep the two
# splits that are used for training and evaluation further down.
dataset = load_dataset("harouzie/vietnews")
train_data, val_data = dataset["train"], dataset["validation"]
# Author's recorded split sizes: 99,134 train / 22,184 validation samples.
# NOTE(review): those two add up to ~121K, so the "144K" in the banner
# presumably also counts a split that is not used here - confirm.
print("Train:", len(train_data), "|", "Val:", len(val_data))

# ============================================
# 2. TOKENIZE
# ============================================
# Step banner; the leading empty item reproduces the blank separator line.
print("", "=" * 60, "BƯỚC 2: Tokenize", "=" * 60, sep="\n")

# Checkpoint id used for the tokenizer here and for the model load in step 3.
MODEL_NAME = "vinai/bartpho-syllable"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(examples):
    """Tokenize a batch of article/abstract pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``'article'`` and an ``'abstract'``
            entry; both are handed unchanged to the module-level
            ``tokenizer``. (Used with ``batched=True``, so presumably each
            entry is a list of strings - confirm against the dataset.)

    Returns:
        The tokenizer output for the articles (truncated to 512 tokens)
        with the abstracts' ``input_ids`` (truncated to 128 tokens) stored
        under the ``'labels'`` key.
    """
    model_inputs = tokenizer(
        examples["article"],
        truncation=True,
        max_length=512,
    )
    target_ids = tokenizer(
        examples["abstract"],
        truncation=True,
        max_length=128,
    )["input_ids"]
    # The summary token ids become the training targets.
    model_inputs["labels"] = target_ids
    return model_inputs

# Run `preprocess` over both splits in batched mode. All original columns
# are removed, so each result holds only what `preprocess` returns.
# The generator is consumed in order: train split first, then validation.
tokenized_train, tokenized_val = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_data, val_data)
)
print("Tokenized:", len(tokenized_train), "train |", len(tokenized_val), "val")

# ============================================
# 3. QLoRA CONFIG + LOAD MODEL
# ============================================
# Step banner; the leading empty item reproduces the blank separator line.
print("", "=" * 60, "BƯỚC 3: Load Model với QLoRA (4-bit)", "=" * 60, sep="\n")

# Quantization settings: 4-bit NF4 with double quantization enabled and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the seq2seq checkpoint with the quantization config applied, then
# pass it through PEFT's k-bit training preparation helper.
model = prepare_model_for_kbit_training(
    AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        quantization_config=bnb_config,
    )
)

# LoRA adapter settings: rank 16, alpha 32, dropout 0.1, no bias training,
# attached to the listed projection / feed-forward module names.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "out_proj",
        "fc1",
        "fc2",
    ],
)
model = get_peft_model(model, lora_config)

# Label is printed without a newline so the parameter report follows it
# on the same line.
print("Trainable parameters: ", end="")
model.print_trainable_parameters()

# Batch collator for the trainer, built from the tokenizer and the wrapped model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# ============================================
# 4. TRAINING (5 EPOCHS)
# ============================================
# Step banner; the leading empty item reproduces the blank separator line.
print("", "=" * 60, "BƯỚC 4: Train 5 Epochs", "=" * 60, sep="\n")

training_args = Seq2SeqTrainingArguments(
    # --- output / checkpointing: one checkpoint per epoch, keep at most 5
    output_dir="./visum-qlora-5epochs",
    save_strategy="epoch",
    save_total_limit=5,
    # --- evaluation: once per epoch; the best checkpoint is the one with
    # the lowest eval_loss and is reloaded when training ends
    eval_strategy="epoch",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=150,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # --- optimization: batch of 8 per device with 2 accumulation steps
    num_train_epochs=5,
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,
    # --- logging: every 500 steps, no external reporting integration
    logging_steps=500,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Start-up notice and the author's hard-coded time estimate (not computed).
print(
    "🚀 Bắt đầu train 144K mẫu - 5 epochs...",
    "⏱️  Dự kiến: ~24 tiếng (mỗi epoch ~4.8 tiếng)",
    sep="\n",
)

trainer.train()

# ============================================
# 5. SAVE MODEL
# ============================================
# Step banner; the leading empty item reproduces the blank separator line.
print("", "=" * 60, "BƯỚC 5: Lưu Model", "=" * 60, sep="\n")

# Write the model and then the tokenizer into the same directory that the
# trainer uses as its output_dir.
# NOTE(review): `model` is the PEFT-wrapped model from step 3, so this
# presumably stores the LoRA adapter rather than full weights - confirm.
model.save_pretrained(
    "./visum-qlora-5epochs",
)
tokenizer.save_pretrained(
    "./visum-qlora-5epochs",
)
print("✅ Done! Model saved to ./visum-qlora-5epochs")

# ============================================
# 6. TRAINING SUMMARY
# ============================================
# Section banner; the leading empty item reproduces the blank separator line.
print("", "=" * 60, "THÔNG TIN HUẤN LUYỆN", "=" * 60, sep="\n")

# One summary line per item. Only the model name is interpolated.
# NOTE(review): every other figure is literal text, not read from
# `training_args` / `lora_config`, so it has to be kept in sync by hand.
print(
    f"  Model gốc:       {MODEL_NAME}",
    "  Dataset:         harouzie/vietnews (144K mẫu)",
    "  Phương pháp:     QLoRA (4-bit) + LoRA (r=16, alpha=32)",
    "  Epochs:          5",
    "  Batch size:      8 × gradient_accumulation 2 = 16",
    "  Learning rate:   2e-4",
    "  GPU:             NVIDIA GeForce RTX 3090 24GB",
    "  Thư mục output:  ./visum-qlora-5epochs",
    sep="\n",
)