|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Definitive SFT training script for Qwen/Qwen2.5-0.5B-Instruct on the corrected |
|
|
itemsety dataset, loaded directly from a private GitHub repo. |
|
|
|
|
|
This script implements 4-bit QLoRA as specified. |
|
|
""" |
|
|
|
|
|
import os
import shutil
import subprocess

import torch
from datasets import load_from_disk
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
|
|
|
|
|
|
|
|
|
|
|
# --- Dataset acquisition -----------------------------------------------------
# SECURITY: never hardcode a GitHub personal access token in source. A token
# committed here is exposed to anyone who can read the file (or the git
# history) and must be revoked. Read it from the environment instead.
GIT_TOKEN = os.environ.get("GIT_TOKEN")
if not GIT_TOKEN:
    raise RuntimeError(
        "GIT_TOKEN environment variable is not set. "
        "Export a GitHub token with read access to the dataset repo."
    )
GIT_REPO_URL = f"https://{GIT_TOKEN}@github.com/oliversl1vka/itemsety-qwen-finetuning.git"
CLONE_PATH = "/tmp/itemsety-qwen-finetuning"
DATASET_PATH = f"{CLONE_PATH}/hf_dataset_enhanced"

print("📦 Cloning private dataset from GitHub...")
# List-form argv (shell=False) avoids shell injection via the URL/path.
subprocess.run(['git', 'clone', GIT_REPO_URL, CLONE_PATH], check=True)
print("✅ Git clone complete.")

# The token is embedded in the clone URL and therefore persisted in
# .git/config; delete the whole .git directory so it does not linger on disk.
print("🔒 Removing .git directory for security...")
shutil.rmtree(f"{CLONE_PATH}/.git")  # stdlib, portable equivalent of `rm -rf`
print("✅ .git directory removed.")
|
|
|
|
|
|
|
|
# --- Dataset loading ---------------------------------------------------------
print(f"💾 Loading dataset from disk at {DATASET_PATH}...")
dataset = load_from_disk(DATASET_PATH)
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# Validate with explicit raises rather than `assert`: asserts are stripped
# under `python -O`, which would silently skip these sanity checks.
if len(train_dataset) != 88:
    raise ValueError(f"Expected 88 train examples, got {len(train_dataset)}")
if len(eval_dataset) != 10:
    raise ValueError(f"Expected 10 val examples, got {len(eval_dataset)}")
if 'messages' not in train_dataset.column_names:
    raise ValueError("Missing 'messages' column")
print(f"✅ Dataset loaded successfully. Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
|
|
|
|
|
|
|
|
|
|
|
# --- Model & tokenizer -------------------------------------------------------
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# 4-bit NF4 quantization (QLoRA): base weights stored as 4-bit NormalFloat,
# compute performed in bfloat16, double quantization for extra memory savings.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

print(f"📥 Loading model '{MODEL_ID}' with 4-bit QLoRA...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place shards on available devices
)
# KV cache is useless during training and conflicts with gradient checkpointing.
model.config.use_cache = False
# Disable the tensor-parallel pretraining path (Llama-style configs; presumably
# a no-op for Qwen2.5 -- TODO confirm the attribute exists on this config).
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as pad token
tokenizer.padding_side = "right"           # right-padding for causal-LM SFT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# LoRA adapter configuration: target every attention and MLP projection so the
# adapters cover the full transformer block.
_ATTN_PROJECTIONS = ["q_proj", "k_proj", "v_proj", "o_proj"]
_MLP_PROJECTIONS = ["gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,               # adapter rank
    lora_alpha=32,      # effective scaling = alpha / r = 2.0
    lora_dropout=0.05,
    bias="none",        # do not train bias terms
    target_modules=_ATTN_PROJECTIONS + _MLP_PROJECTIONS,
)
|
|
|
|
|
|
|
|
# Hugging Face TrainingArguments for the QLoRA fine-tune.
training_args = TrainingArguments(
    # --- Output & Hub publishing ---
    output_dir="qwen2.5-0.5b-itemsety-qlora",
    push_to_hub=True,
    hub_model_id="OliverSlivka/qwen2.5-0.5b-itemsety-qlora-final",
    hub_strategy="all_checkpoints",  # push every saved checkpoint, not only the final one

    # --- Core schedule ---
    # Effective batch size = 4 * 4 = 16 per device; with 88 train examples that
    # is ~6 optimizer steps per epoch, ~18 steps total over 3 epochs
    # (assuming a single device -- TODO confirm device count).
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",  # paged 8-bit AdamW, the usual QLoRA optimizer

    # --- Logging & checkpointing ---
    logging_steps=5,
    save_strategy="steps",
    # NOTE(review): if the run really is ~18 total steps (see above),
    # save_steps/eval_steps=20 never fire mid-run -- only the end-of-training
    # save happens. Confirm the intended step budget.
    save_steps=20,
    save_total_limit=2,  # keep at most 2 checkpoints on disk

    # --- Evaluation ---
    eval_strategy="steps",
    eval_steps=20,

    # --- Optimization details ---
    # NOTE(review): the "constant" scheduler ignores warmup_ratio; if the 3%
    # warmup is intended, "constant_with_warmup" is the matching scheduler.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    max_grad_norm=0.3,  # tight gradient clipping, per the QLoRA recipe
    max_steps=-1,       # -1: derive total steps from num_train_epochs

    # --- Experiment tracking ---
    report_to="trackio",
    run_name="qwen-itemsety-qlora-run-final"
)
|
|
|
|
|
|
|
|
|
|
|
print("🎯 Initializing SFTTrainer...")
# NOTE(review): whether `max_length`, `dataset_text_field`, and `packing` are
# accepted as direct SFTTrainer kwargs (vs. SFTConfig fields) varies across
# TRL releases -- confirm against the pinned trl version. It is also assumed
# TRL applies the chat template to the conversational "messages" column.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,        # SFTTrainer attaches the LoRA adapters to the 4-bit model
    args=training_args,
    max_length=2048,                # truncate sequences beyond 2048 tokens
    dataset_text_field="messages",
    packing=False                   # one example per sequence (tiny dataset)
)
|
|
|
|
|
|
|
|
|
|
|
# --- Training & publishing ---------------------------------------------------
print("🚀 Starting training...")
trainer.train()

print("✅ Training complete!")
print(f"💾 Model pushed to Hub at: https://huggingface.co/{training_args.hub_model_id}")

# hub_strategy="all_checkpoints" already pushes during training; this final
# push uploads the end-of-training adapter state one last time.
print("... pushing final adapter one more time.")
trainer.push_to_hub()
print("✅ All done.")
|
|
|