"""
DPO Training Script for Qwen3-0.6B on n8n Workflow Reasoning

This script fine-tunes Qwen3-0.6B using Direct Preference Optimization (DPO)
to improve reasoning quality when generating n8n workflows.

The dataset contains:
- prompt: task description for generating an n8n workflow
- chosen: high-quality response with detailed <thinking> reasoning
- rejected: low-quality response with superficial reasoning or errors

Usage:
    hf jobs uv run \
        --script train_qwen3_dpo_reasoning.py \
        --flavor l40sx1 \
        --name qwen3-dpo-reasoning \
        --timeout 12h
"""

import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Values read through os.environ.get() can be overridden per run; the second
# argument is the fallback used when the variable is unset.

# Base checkpoint to fine-tune.
MODEL_NAME = os.environ.get("BASE_MODEL", "Qwen/Qwen3-0.6B")

# Preference dataset: Hub repository and the sub-directory passed as data_dir.
DATASET_REPO = "stmasson/n8n-workflows-thinking"
DATA_DIR = "data/dpo"

# Local output directory and the Hub repository used when pushing.
OUTPUT_DIR = "./qwen3-dpo-reasoning"
HF_REPO = os.environ.get("HF_REPO", "stmasson/qwen3-0.6b-n8n-reasoning")

# Training hyperparameters (forwarded to DPOConfig further down).
NUM_EPOCHS = int(os.environ.get("NUM_EPOCHS", "1"))
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "1"))
GRAD_ACCUM = int(os.environ.get("GRAD_ACCUM", "8"))
LEARNING_RATE = float(os.environ.get("LEARNING_RATE", "5e-6"))
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "4096"))
MAX_PROMPT_LENGTH = int(os.environ.get("MAX_PROMPT_LENGTH", "512"))
BETA = float(os.environ.get("BETA", "0.1"))

# LoRA adapter hyperparameters (forwarded to LoraConfig further down).
LORA_R = int(os.environ.get("LORA_R", "32"))
LORA_ALPHA = int(os.environ.get("LORA_ALPHA", "64"))
LORA_DROPOUT = float(os.environ.get("LORA_DROPOUT", "0.05"))

# ---------------------------------------------------------------------------
# Banner and Hugging Face authentication
# ---------------------------------------------------------------------------
print("=" * 60)
print("DPO TRAINING - QWEN3-0.6B N8N REASONING")
print("=" * 60)

# Log in only when a token is provided. hf_token is read again further down
# to decide whether the trainer is configured to push to the Hub.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("Authenticated with HuggingFace")
else:
    print("Warning: HF_TOKEN not set, push disabled")

# ---------------------------------------------------------------------------
# Model and tokenizer
# ---------------------------------------------------------------------------
print(f"\nLoading model: {MODEL_NAME}")

# NOTE(review): trust_remote_code=True lets the checkpoint repository supply
# Python code that is executed at load time, and MODEL_NAME can be overridden
# through the BASE_MODEL environment variable - confirm this is intended.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print(f"Model loaded: {model.config.num_hidden_layers} layers, {model.config.hidden_size} hidden size")

# ---------------------------------------------------------------------------
# LoRA adapter configuration
# ---------------------------------------------------------------------------
print(f"\nLoRA config: r={LORA_R}, alpha={LORA_ALPHA}")

# Adapters are attached to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down) named below; bias terms are left untouched.
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
print(f"\nLoading dataset: {DATASET_REPO}")

# Both splits are read from the same repository and data_dir.
train_dataset = load_dataset(DATASET_REPO, data_dir=DATA_DIR, split="train")
eval_dataset = load_dataset(DATASET_REPO, data_dir=DATA_DIR, split="validation")

print(f"Train: {len(train_dataset)} examples")
print(f"Validation: {len(eval_dataset)} examples")


def filter_by_length(example, max_chars=50000):
    """Return True when a preference example is short enough to keep.

    The size of an example is ``len(prompt)`` plus the larger of
    ``len(chosen)`` and ``len(rejected)``. For string fields this is a
    character count, not a token count.

    Args:
        example: Mapping with ``"prompt"``, ``"chosen"`` and ``"rejected"``
            entries.
        max_chars: Exclusive upper bound on the combined length. Defaults to
            50000.

    Returns:
        ``True`` if the combined length is strictly below ``max_chars``;
        ``False`` otherwise, or when any of the three fields is ``None``.

    Raises:
        KeyError: If one of the three keys is missing from ``example``.
    """
    prompt = example["prompt"]
    chosen = example["chosen"]
    rejected = example["rejected"]

    # An example with a missing side cannot form a preference pair; drop it
    # rather than failing on len(None).
    if prompt is None or chosen is None or rejected is None:
        return False

    longest_response = max(len(chosen), len(rejected))
    return len(prompt) + longest_response < max_chars


# Drop over-long examples from both splits.
train_dataset = train_dataset.filter(filter_by_length)
eval_dataset = eval_dataset.filter(filter_by_length)

print(f"After filtering - Train: {len(train_dataset)}, Val: {len(eval_dataset)}")

# Fail with a clear message instead of an IndexError on the preview below
# when filtering removed every training example.
if len(train_dataset) == 0:
    raise ValueError("No training examples left after length filtering")

# Preview the first 100 characters of the first remaining prompt.
print("\nExample prompt:", train_dataset[0]["prompt"][:100], "...")

# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
print("\nTraining configuration:")
print(f"  - Epochs: {NUM_EPOCHS}")
print(f"  - Batch size: {BATCH_SIZE}")
print(f"  - Gradient accumulation: {GRAD_ACCUM}")
print(f"  - Effective batch size: {BATCH_SIZE * GRAD_ACCUM}")
print(f"  - Learning rate: {LEARNING_RATE}")
print(f"  - Max length: {MAX_LENGTH}")
print(f"  - DPO beta: {BETA}")

training_args = DPOConfig(
    output_dir=OUTPUT_DIR,
    # Optimisation schedule.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Numeric precision.
    bf16=True,
    tf32=True,
    # Logging, checkpointing and evaluation cadence.
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=500,
    # DPO-specific settings.
    max_length=MAX_LENGTH,
    max_prompt_length=MAX_PROMPT_LENGTH,
    beta=BETA,
    loss_type="sigmoid",
    # Memory saving.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    report_to="none",
    run_name="qwen3-dpo-reasoning",
    # Hub upload is enabled only when an HF token was provided.
    hub_model_id=HF_REPO if hf_token else None,
    push_to_hub=bool(hf_token),
    hub_strategy="checkpoint",
)

# ---------------------------------------------------------------------------
# Trainer
# ---------------------------------------------------------------------------
print("\nInitializing DPO trainer...")

# The LoRA configuration is handed to the trainer together with the base
# model; the tokenizer is passed as the processing class.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
)

# Report how many parameters will receive gradients. The count is taken from
# trainer.model (the module the trainer holds) so it does not depend on the
# `model` object having been modified in place when the adapter was attached.
trained_model = trainer.model
trainable_params = sum(p.numel() for p in trained_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in trained_model.parameters())
print(f"\nTrainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")

print("\n" + "=" * 60)
print("STARTING DPO TRAINING")
print("=" * 60)

trainer.train()

# ---------------------------------------------------------------------------
# Save and publish
# ---------------------------------------------------------------------------
print("\nSaving model...")
trainer.save_model(f"{OUTPUT_DIR}/final")

# Push only when a token was provided at start-up.
if hf_token:
    print(f"Pushing to {HF_REPO}...")
    trainer.push_to_hub()
    print(f"Model available at: https://huggingface.co/{HF_REPO}")

print("\n" + "=" * 60)
print("DPO TRAINING COMPLETE")
print("=" * 60)