---
# Run output location and reproducibility seed.
run:
  run_dir: "./runs/instruct_run_24b"
  seed: 42
# WandB integration for experiment tracking
wandb:
  enabled: true  # set to false to disable wandb logging
  project: "sft-training"  # WandB project name
  entity: null  # WandB entity/team (optional)
  name: null  # run name (optional; auto-generated when null)
  tags: ["sft-lora", "24b-Devstral"]  # list of tags for the run (e.g., ["lora", "qlora", "experiment-1"])
  notes: null  # run description/notes (optional)
model:
  # Local merged CPT-LoRA checkpoint (24B); a HF repo id may also be used here.
  # NOTE(review): the previous comment said "Qwen2.5-Coder-14B", which does not
  # match this 24B path or the "24b-Devstral" wandb tag — stale comment removed.
  repo_id: "./CPT/runs/cpt_run_v1/merged_24b_cpt_lora"
  revision: null
  # Used only when repo_id is a HF repo (not a local path)
  base_local_dir: "base_model"
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: "auto"
  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"
  # QLoRA (4-bit) settings — only active when use_4bit is true
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"
  # optional: "flash_attention_2" | "sdpa" | null
  attn_implementation: null
data:
  train_jsonl: "../sft_dataset.jsonl"
  eval_jsonl: null  # when null, the eval set is split off train via eval_split_ratio
  eval_split_ratio: 0.1
  # Field names in your JSONL data
  instruction_field: "instruction"  # this will be the system prompt
  input_field: "input"              # this is the task description
  output_field: "output"            # this is the analysis + selection
  # Formatting options
  format_type: "custom"  # "chatml" | "alpaca" | "custom"
  # For chatml format
  system_prompt: |
    You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.
    ## Output Format
    ##OUTPUT
    Explain the data flow and why each component must change:
    - Flow: [Input → Processing → Output with arrows]
    - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
    - Explain coupling between components
    ##SELECT
    modify::crates/path/to/file.rs::impl::ComponentName
    add::crates/another/file.rs::function::AnotherComponent
    <EOS>
    ## Rules
    1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
    2. Use `::` for nested items: `status::StructName::Type::Name`
    3. Always explain "must change because" and "without this"
    4. Types of components: function, struct, enum, impl, trait
    5. If there is extra information (e.g., enum variants), include that too.
    6. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>
    ## Example
    ##TASK
    Add webhook subscription support
    ##OUTPUT
    The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.
    ##SELECT
    crates/common_enums/src/enums.rs::EventClass
    crates/common_enums/src/transformers.rs::SubscriptionStatus
    <EOS>
  # For custom format (only used when format_type="custom")
  custom_template: "##INSTRUCTION\n{instruction}<|im_end|>\n##TASK\n{input}<|im_end|>\n##OUTPUT\n{output}<|im_end|>"
  max_length: 2048
  shuffle: true
  num_proc: 4
# LoRA adapter configuration (applied when enabled is true)
peft:
  enabled: true
  r: 8            # LoRA rank
  lora_alpha: 16  # LoRA scaling factor
  lora_dropout: 0.05
  bias: "none"
  target_modules: "auto"  # auto-detect which modules receive adapters
train:
  # max_steps: 10  # uncomment to cap total steps for a quick smoke test
  num_train_epochs: 6
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8  # effective batch size = 1 x 8 per device
  # Written as 1.0e-4 (not 1e-4): YAML 1.1 loaders such as PyYAML resolve a
  # dotless exponent as the STRING "1e-4", not a float.
  learning_rate: 1.0e-4
  weight_decay: 0.0
  warmup_ratio: 0.08
  lr_scheduler_type: "cosine"
  optim: "adamw_torch"  # paged_adamw_8bit would require use_4bit: true
  max_grad_norm: 0.8
  gradient_checkpointing: true
  logging_steps: 2
  save_strategy: "steps"
  save_steps: 500  # must stay a round multiple of eval_steps for best-model loading
  save_total_limit: 20
  # NOTE(review): recent transformers releases renamed this key to
  # `eval_strategy` — confirm the installed version still accepts it.
  evaluation_strategy: "steps"
  eval_steps: 100
  load_best_model_at_end: true
  # Early stopping
  early_stopping:
    enabled: true
    patience: 3          # evaluations with no improvement before stopping
    min_delta: 0.001     # minimum change to qualify as improvement
    metric: "eval_loss"  # metric to monitor
    mode: "min"          # "min" for loss, "max" for accuracy/etc.
  resume_from_checkpoint: "auto"
# Merge the trained LoRA adapter back into the base weights after training.
merge:
  enabled: true
  # NOTE(review): merged in float16 while training ran in bfloat16 — confirm
  # the downcast is intended for the deployment target.
  merged_dtype: "float16"
  max_shard_size: "2GB"
  output_dir: "./merged_24b_instruct_lora"