Spaces:
Sleeping
Sleeping
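"""
ViSum - QLoRA fine-tuning of BARTpho on the VietNews dataset (5 epochs).

The dataset is described as ~144K samples; this script trains on its
"train" split and evaluates on its "validation" split.
GPU: RTX 3090 24GB
Author: OrdinaryAI
"""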
| from datasets import load_dataset | |
| from transformers import ( | |
| AutoTokenizer, AutoModelForSeq2SeqLM, | |
| Seq2SeqTrainingArguments, Seq2SeqTrainer, | |
| DataCollatorForSeq2Seq, BitsAndBytesConfig, | |
| ) | |
| from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training | |
| import torch | |
| import os | |
# ============================================
# 1. LOAD DATASET
# ============================================
# Step banner: rule, title, rule - one item per output line.
print("=" * 60, "BƯỚC 1: Load dataset VietNews (144K mẫu)", "=" * 60, sep="\n")

# Load the VietNews dataset and pick out the two splits this script uses.
dataset = load_dataset("harouzie/vietnews")
# Split sizes noted by the original author: train 99,134 / validation 22,184.
train_data, val_data = dataset["train"], dataset["validation"]
print(f"Train: {len(train_data)} | Val: {len(val_data)}")
# ============================================
# 2. TOKENIZE
# ============================================
# Step banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "BƯỚC 2: Tokenize", "=" * 60, sep="\n")

# Checkpoint identifier; also used further down to load the model weights
# and in the final training summary.
MODEL_NAME = "vinai/bartpho-syllable"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def preprocess(examples, max_source_length=512, max_target_length=128):
    """Tokenize a batch of article/abstract pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``'article'`` column (model input)
            and an ``'abstract'`` column (target summary).
        max_source_length: Token limit for articles; longer inputs are
            truncated. Defaults to 512.
        max_target_length: Token limit for abstracts; longer targets are
            truncated. Defaults to 128.

    Returns:
        The tokenizer output for the articles, with the abstract token ids
        stored under the ``'labels'`` key.
    """
    model_inputs = tokenizer(
        examples['article'],
        max_length=max_source_length,
        truncation=True,
    )
    # text_target= is the documented way to tokenize targets: it routes the
    # summaries through the tokenizer's target-side path, so the labels stay
    # correct even for tokenizers that treat source and target differently.
    targets = tokenizer(
        text_target=examples['abstract'],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs['labels'] = targets['input_ids']
    return model_inputs
# Tokenize both splits with identical settings (train first, then
# validation). All original columns are removed, so each result holds only
# what preprocess() returns.
tokenized_train, tokenized_val = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_data, val_data)
)
print(f"Tokenized: {len(tokenized_train)} train | {len(tokenized_val)} val")
# ============================================
# 3. QLoRA CONFIG + LOAD MODEL
# ============================================
# Step banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "BƯỚC 3: Load Model với QLoRA (4-bit)", "=" * 60, sep="\n")

# 4-bit NF4 quantization with double quantization; compute dtype is fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# LoRA adapter settings: rank 16, alpha 32, dropout 0.1, no bias training,
# applied to the attention projections and the feed-forward layers.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2"],
)

# Load the quantized base model, prepare it for k-bit training, then wrap it
# with the LoRA adapters.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# The label is printed without a newline so the report produced by
# print_trainable_parameters() continues on the same line.
print("Trainable parameters: ", end="")
model.print_trainable_parameters()

# Batch collator for seq2seq examples, built from the tokenizer and the
# wrapped model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)
# ============================================
# 4. TRAINING (5 EPOCHS)
# ============================================
print("\n" + "=" * 60)
print("BƯỚC 4: Train 5 Epochs")
print("=" * 60)

# Hyperparameters for the run. Evaluation and checkpointing both happen once
# per epoch, so the checkpoint with the lowest eval_loss can be restored when
# training finishes (load_best_model_at_end).
training_args = Seq2SeqTrainingArguments(
    output_dir="./visum-qlora-5epochs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=500,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective train batch: 8 x 2 = 16
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=5,
    # NOTE(review): no compute_metrics is passed to the trainer below, so
    # these generation settings presumably do not affect the eval_loss-based
    # checkpoint selection - confirm whether a generation metric (e.g. ROUGE)
    # was intended.
    predict_with_generate=True,
    generation_max_length=150,
    fp16=True,
    save_total_limit=5,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Name the label column explicitly. The model handed to the trainer is
    # PEFT-wrapped, and eval_loss (required by metric_for_best_model) is only
    # computed when the trainer knows which input holds the labels; this
    # avoids relying on automatic detection through the wrapper.
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Report the real training-set size: training runs on the train split only,
# not on the full ~144K-sample dataset the previous message claimed.
print(f"🚀 Bắt đầu train {len(tokenized_train)} mẫu - 5 epochs...")
# Rough estimate supplied by the original author; not computed.
print("⏱️ Dự kiến: ~24 tiếng (mỗi epoch ~4.8 tiếng)")
trainer.train()
# ============================================
# 5. SAVE MODEL
# ============================================
# Step banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "BƯỚC 5: Lưu Model", "=" * 60, sep="\n")

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./visum-qlora-5epochs")
print("✅ Done! Model saved to ./visum-qlora-5epochs")
# ============================================
# 6. TRAINING INFO
# ============================================
# Banner followed by a fixed summary of the run configuration. Only the
# model name is interpolated; every other line is a literal written by hand.
print("\n" + "=" * 60, "THÔNG TIN HUẤN LUYỆN", "=" * 60, sep="\n")
print(
    f" Model gốc: {MODEL_NAME}",
    " Dataset: harouzie/vietnews (144K mẫu)",
    " Phương pháp: QLoRA (4-bit) + LoRA (r=16, alpha=32)",
    " Epochs: 5",
    " Batch size: 8 × gradient_accumulation 2 = 16",
    " Learning rate: 2e-4",
    " GPU: NVIDIA GeForce RTX 3090 24GB",
    " Thư mục output: ./visum-qlora-5epochs",
    sep="\n",
)