ssaraf1 commited on
Commit
1c623d5
·
verified ·
1 Parent(s): e1b8183

Upload lora_config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. lora_config.yaml +88 -0
lora_config.yaml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
---
# =============================================================
# MLX-LM LoRA Fine-Tuning Config
# SLM Workflow Planner — Qwen2.5-7B-Instruct
# =============================================================
#
# Optimized for:
# - Apple M4 Pro (48GB unified memory)
# - Policy classification task (structured output)
# - 554K instruction pairs from 89-workflow multi-topology corpus
#
# Training objective:
# Stage 1: decision_type classification (NEXT/FORK/JOIN/RETRY/META)
# Stage 2: node subset selection from eligible candidates
#
# Key tuning decisions:
# - LR 8e-5 (lower for 7B stability + structured output)
# - 8000 iters ≈ 6.4% epoch (sufficient for topology generalization)
# - num_layers 28/32 (planner reasoning in mid-upper stack)
# - dropout 0.02 (dataset large enough, avoid slow convergence)
# - warmup 400 (5% of 8000 iters)
# =============================================================

# --- Model ---
model: "Qwen/Qwen2.5-7B-Instruct"

# --- Training ---
train: true
fine_tune_type: "lora"
optimizer: "adam"

# --- Iterations ---
# Dataset: 554K instruction pairs → ~499K train
# At batch_size=4: 499K/4 = 124,750 steps per epoch
# 8000 iters ≈ 6.4% epoch — enough for policy + topology learning
# without overfit risk
iters: 8000

batch_size: 4  # 7B on 48GB — safe headroom
max_seq_length: 512  # Prompts avg ~65-115 tokens, 512 gives headroom

# --- Learning rate ---
# 8e-5 is in the safe zone for 7B LoRA on classification tasks
# (1.5e-4 was borderline high — risk of logit instability)
# cosine_decay(init, decay_steps, end)
learning_rate: 8.0e-5
# NOTE(review): warmup/warmup_init nested under lr_schedule per mlx-lm's
# schedule config convention — confirm against the mlx-lm version in use.
lr_schedule:
  name: "cosine_decay"
  arguments: [8.0e-5, 8000, 1.0e-6]
  warmup: 400  # 5% warmup (400/8000)
  warmup_init: 0.0

# --- LoRA parameters ---
# rank=16 sufficient for policy classification
# scale = alpha/rank = 32/16 = 2.0
# Qwen2.5-7B has 32 layers — LoRA on last 28 (87.5%)
# Planner reasoning lives in mid-upper stack
num_layers: 28
lora_parameters:
  rank: 16
  dropout: 0.02  # Lower dropout: 554K samples, avoid slow convergence
  scale: 2.0

# --- Prompt masking ---
# Critical: only train on assistant output (decision), not the prompt
mask_prompt: true

# --- Gradient ---
grad_checkpoint: true  # Essential for 7B on 48GB
grad_accumulation_steps: 2  # Effective batch = 4 × 2 = 8

# --- Logging & saving ---
steps_per_report: 50
steps_per_eval: 100  # More frequent eval for planner loss curves (jagged)
val_batches: 100  # 100 × 4 = 400 samples per eval (less noisy)
save_every: 50  # Frequent saves — crash-proof, resume from last checkpoint

# --- Data ---
data: "src_slm/training/data"

# --- Adapter output ---
adapter_path: "src_slm/training/adapters_7b"

# --- Evaluation ---
test: true
test_batches: 200  # Thorough test evaluation

# --- Reproducibility ---
seed: 42