Spaces:
Sleeping
Sleeping
File size: 4,812 Bytes
"""
ViSum - QLoRA fine-tuning of BARTpho on the VietNews dataset (144K samples, 5 epochs)
GPU: RTX 3090 24GB
Author: OrdinaryAI
"""
from datasets import load_dataset
from transformers import (
AutoTokenizer, AutoModelForSeq2SeqLM,
Seq2SeqTrainingArguments, Seq2SeqTrainer,
DataCollatorForSeq2Seq, BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch
import os
# --------------------------------------------
# Step 1: load the VietNews dataset
# --------------------------------------------
print("=" * 60, "BƯỚC 1: Load dataset VietNews (144K mẫu)", "=" * 60, sep="\n")

dataset = load_dataset("harouzie/vietnews")

# Only the train and validation splits are used by this script.
# The sizes noted below come from the original author's comments and are
# not checked here; the print that follows reports the actual counts.
train_data, val_data = dataset["train"], dataset["validation"]  # ~99,134 / ~22,184 samples
print(f"Train: {len(train_data)} | Val: {len(val_data)}")
# --------------------------------------------
# Step 2: tokenization
# --------------------------------------------
print("\n" + "=" * 60, "BƯỚC 2: Tokenize", "=" * 60, sep="\n")

# Checkpoint identifier; the same name is reused when the model is loaded.
MODEL_NAME = "vinai/bartpho-syllable"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def preprocess(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: A batch mapping that provides an ``'article'`` field
            (source text) and an ``'abstract'`` field (target summary).

    Returns:
        The tokenizer output for the articles (truncated to 512 tokens)
        with an added ``'labels'`` entry holding the token ids of the
        abstracts (truncated to 128 tokens).
    """
    encoded_articles = tokenizer(examples["article"], max_length=512, truncation=True)
    encoded_abstracts = tokenizer(examples["abstract"], max_length=128, truncation=True)

    # The summary token ids become the training targets.
    encoded_articles["labels"] = encoded_abstracts["input_ids"]
    return encoded_articles
# Run `preprocess` batch-wise over both splits (train first, then validation).
# Every original column is removed, so only what `preprocess` returns is kept.
tokenized_train, tokenized_val = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_data, val_data)
)
print(f"Tokenized: {len(tokenized_train)} train | {len(tokenized_val)} val")
# --------------------------------------------
# Step 3: QLoRA configuration and model loading
# --------------------------------------------
print("\n" + "=" * 60, "BƯỚC 3: Load Model với QLoRA (4-bit)", "=" * 60, sep="\n")

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base checkpoint and pass it straight through PEFT's
# k-bit training preparation helper.
model = prepare_model_for_kbit_training(
    AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        quantization_config=bnb_config,
    )
)

# LoRA settings: rank 16, alpha 32, dropout 0.1, no bias training.
# Adapters are attached to the modules whose names appear in `target_modules`.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2"],
)
model = get_peft_model(model, lora_config)

# `print_trainable_parameters` prints its own line; the prefix stays on it.
print("Trainable parameters: ", end="")
model.print_trainable_parameters()

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# --------------------------------------------
# Step 4: training (5 epochs)
# --------------------------------------------
print("\n" + "=" * 60, "BƯỚC 4: Train 5 Epochs", "=" * 60, sep="\n")

training_args = Seq2SeqTrainingArguments(
    output_dir="./visum-qlora-5epochs",
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-4,
    fp16=True,
    # Batching: 8 per device with 2 gradient-accumulation steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch, keeping at most 5 checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # Best checkpoint is selected by the lowest eval_loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Generation-related options.
    # NOTE(review): no `compute_metrics` is passed to the trainer below, so
    # generated text is not scored by this script - confirm this is intended.
    predict_with_generate=True,
    generation_max_length=150,
    # Logging
    logging_steps=500,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# The duration below is the original author's estimate, not a measured value.
print("🚀 Bắt đầu train 144K mẫu - 5 epochs...")
print("⏱️ Dự kiến: ~24 tiếng (mỗi epoch ~4.8 tiếng)")
trainer.train()
# --------------------------------------------
# Step 5: save the model and tokenizer
# --------------------------------------------
print("\n" + "=" * 60, "BƯỚC 5: Lưu Model", "=" * 60, sep="\n")

# Model and tokenizer are written to the same directory, which is also the
# trainer's `output_dir`.
model.save_pretrained("./visum-qlora-5epochs")
tokenizer.save_pretrained("./visum-qlora-5epochs")

print("✅ Done! Model saved to ./visum-qlora-5epochs")
# --------------------------------------------
# Step 6: training summary
# --------------------------------------------
# Everything except the model name is a fixed string, so these values must
# be kept in sync by hand with the configuration used earlier in the script.
print(
    "\n" + "=" * 60,
    "THÔNG TIN HUẤN LUYỆN",
    "=" * 60,
    f" Model gốc: {MODEL_NAME}",
    " Dataset: harouzie/vietnews (144K mẫu)",
    " Phương pháp: QLoRA (4-bit) + LoRA (r=16, alpha=32)",
    " Epochs: 5",
    " Batch size: 8 × gradient_accumulation 2 = 16",
    " Learning rate: 2e-4",
    " GPU: NVIDIA GeForce RTX 3090 24GB",
    " Thư mục output: ./visum-qlora-5epochs",
    sep="\n",
)
|