# task2file-llm / trainer-kit / DPO / config_dpo.yaml
# Uploaded by SirajRLX via huggingface_hub (commit 4eae728, verified)
---
# Run bookkeeping: output directory for checkpoints/logs and the global RNG seed.
run:
  run_dir: "./runs/dpo_run_24b_v1"
  seed: 42
# WandB integration for experiment tracking
wandb:
  enabled: true
  project: "dpo-training"
  entity: null  # null -> default entity of the logged-in account
  name: null    # null -> auto-generated run name
  tags: ["dpo-lora", "preference-optimization"]
  notes: null
# Base model, tokenizer, and quantization settings.
model:
  # Use the SFT model as base
  repo_id: "../../Models/Devstral-Small-2-24B-HS-CPT-SFT"
  revision: null
  # Used only when repo_id is a HF repo (not a local path)
  base_local_dir: "base_model"
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: "auto"
  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"
  # QLoRA (4-bit quantization); the bnb_* keys below only apply when use_4bit is true
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"
  # optional: "flash_attention_2" | "sdpa" | null
  attn_implementation: null
# Dataset location, JSONL field mapping, and prompt formatting.
data:
  train_jsonl: "dpo_pairs_generated.jsonl"
  eval_jsonl: null  # null -> hold out eval_split_ratio of the train set
  eval_split_ratio: 0.1
  # Field names in your JSONL data for DPO
  # DPO requires: prompt, chosen, rejected
  prompt_field: "prompt"
  chosen_field: "chosen"
  rejected_field: "rejected"
  # If you have a file-level F1 score field for ranking
  score_field: "f1_score"  # Optional: used for ranking if available
  # Formatting options
  format_type: "chatml"  # "chatml" | "alpaca" | "custom"
  # System prompt to prepend to all prompts
  # NOTE(review): rules were numbered 1,2,3,3,4,5 — renumbered to 1-6.
  system_prompt: |
    You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.
    ## Output Format
    ##OUTPUT
    Explain the data flow and why each component must change:
    - Flow: [Input Processing Output with arrows]
    - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
    - Explain coupling between components
    ##SELECT
    modify::crates/path/to/file.rs::impl::ComponentName
    add::crates/another/file.rs::function::AnotherComponent
    <EOS>
    ## Rules
    1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
    2. Use `::` for nested items: `status::StructName::Type::Name`
    3. Always explain "must change because" and "without this"
    4. Types of components: function, struct, enum, impl, trait
    5. If there is extra information (e.g., enum variants), include that too.
    6. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>
  max_length: 2048
  shuffle: true
  num_proc: 4
# LoRA adapter configuration (applied on top of the base model).
peft:
  enabled: true
  r: 16             # LoRA rank
  lora_alpha: 32    # scaling factor (alpha / r = effective scale of 2)
  lora_dropout: 0.05
  bias: "none"
  target_modules: "auto"  # "auto" -> let the trainer pick target projections
# DPO specific parameters
dpo:
  beta: 0.1            # Temperature parameter for DPO loss (higher = less aggressive)
  label_smoothing: 0.0 # Label smoothing for DPO
  loss_type: "sigmoid" # "sigmoid" | "hinge" | "ipo" | "kto"
  # Reference model settings
  use_reference_model: true  # If false, uses frozen copy of initial model
  reference_free: false      # If true, doesn't use reference model at all
# Trainer hyperparameters.
train:
  num_train_epochs: 3
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8  # effective batch = 1 x 8 per device
  # Fixed: "5e-5" is parsed as a STRING by YAML 1.1 loaders (PyYAML) because
  # the 1.1 float pattern requires a dot; "5.0e-5" loads as a float everywhere.
  learning_rate: 5.0e-5  # Lower than SFT for stability
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: "cosine"
  optim: "adamw_torch"
  max_grad_norm: 1.0
  gradient_checkpointing: true
  logging_steps: 2
  save_strategy: "steps"
  save_steps: 100
  save_total_limit: 10
  evaluation_strategy: "steps"
  eval_steps: 25
  load_best_model_at_end: true
  # Early stopping
  early_stopping:
    enabled: true
    patience: 5       # evaluations without improvement before stopping
    min_delta: 0.001  # minimum metric change that counts as improvement
    metric: "eval_loss"
    mode: "min"
  resume_from_checkpoint: "auto"  # presumably "auto" -> latest checkpoint in run_dir; confirm in trainer code
# Post-training merge of the LoRA adapter into a standalone model.
merge:
  enabled: true
  merged_dtype: "float16"
  max_shard_size: "2GB"
  # Fixed: directory said "14b" but this run targets a 24B base (see model.repo_id)
  output_dir: "./merged_24b_dpo_lora"