# File size: 4,633 Bytes
# e527a65 (revision id? — preserved from original header)
# Run bookkeeping: output location and global RNG seed.
run:
  run_dir: "./runs/instruct_run_24b"  # checkpoints and logs are written under here
  seed: 42  # global seed for reproducibility

# WandB integration for experiment tracking
wandb:
  enabled: true  # toggle Weights & Biases logging on/off
  project: "sft-training"  # WandB project name
  entity: null  # WandB entity/team (optional)
  name: null  # Run name (optional, will auto-generate if null)
  tags: ["sft-lora", "24b-Devstral"]  # List of tags for the run (e.g., ["lora", "qlora", "experiment-1"])
  notes: null  # Run description/notes (optional)

model:
  # Base model: local CPT-merged 24B checkpoint.
  # NOTE(review): a previous comment here said "Qwen2.5-Coder-14B", which
  # contradicts the 24B path below and the "24b-Devstral" wandb tag —
  # confirm which base model this actually is.
  repo_id: "./CPT/runs/cpt_run_v1/merged_24b_cpt_lora"
  revision: null

  # Used only when repo_id is a HF repo (not a local path)
  base_local_dir: "base_model"

  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: "auto"

  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"

  # QLoRA — presumably the bnb_* knobs are ignored while use_4bit is false;
  # verify against the training script.
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"

  # optional: "flash_attention_2" | "sdpa" | null
  attn_implementation: null

data:
  train_jsonl: "../sft_dataset.jsonl"
  eval_jsonl: null  # when null, the eval set is carved from train via eval_split_ratio
  eval_split_ratio: 0.1

  # Field names in your JSONL data
  instruction_field: "instruction"  # This will be the system prompt
  input_field: "input"             # This is the task description
  output_field: "output"           # This is the analysis + selection

  # Formatting options
  format_type: "custom"  # "chatml" | "alpaca" | "custom"

  # System prompt for the chatml format.
  # NOTE(review): format_type is "custom" above, so this may be unused —
  # confirm whether the custom path also injects system_prompt.
  # Fixes applied below: restored the arrows that were lost in encoding
  # (the text itself says "with arrows"), renumbered the duplicated rule
  # "3.", and made the example ##SELECT lines follow the mandated
  # action::path::Type::Name format.
  system_prompt: |
    You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.

    ## Output Format

    ##OUTPUT
    Explain the data flow and why each component must change:
    - Flow: [Input → Processing → Output with arrows]
    - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
    - Explain coupling between components

    ##SELECT
    modify::crates/path/to/file.rs::impl::ComponentName
    add::crates/another/file.rs::function::AnotherComponent
    <EOS>

    ## Rules

    1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
    2. Use `::` for nested items: `status::StructName::Type::Name`
    3. Always explain "must change because" and "without this"
    4. Types of components: function, struct, enum, impl, trait
    5. If there is extra information (e.g., enum variants), include that too.
    6. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>

    ## Example

    ##TASK
    Add webhook subscription support

    ##OUTPUT
    The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.

    ##SELECT
    modify::crates/common_enums/src/enums.rs::enum::EventClass
    modify::crates/common_enums/src/transformers.rs::impl::SubscriptionStatus
    <EOS>

  # For custom format (only used when format_type="custom")
  custom_template: "##INSTRUCTION\n{instruction}<|im_end|>\n##TASK\n{input}<|im_end|>\n##OUTPUT\n{output}<|im_end|>"

  max_length: 2048  # token cutoff per example
  shuffle: true
  num_proc: 4  # parallel workers for dataset preprocessing

# LoRA adapter configuration (PEFT).
peft:
  enabled: true
  r: 8  # LoRA rank
  lora_alpha: 16  # scaling factor (alpha/r = 2.0)
  lora_dropout: 0.05
  bias: "none"  # bias terms are not trained
  target_modules: "auto"  # presumably resolved to projection layers by the trainer — verify

train:
  # max_steps: 10  # uncomment to cap steps for a quick smoke test
  num_train_epochs: 6

  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8  # effective batch = 1 * 8 per device

  learning_rate: 1e-4
  weight_decay: 0.0
  warmup_ratio: 0.08
  lr_scheduler_type: "cosine"

  optim: "adamw_torch"  # ✅ Changed from paged_adamw_8bit (requires use_4bit=true)
  max_grad_norm: 0.8
  gradient_checkpointing: true  # trades recompute for activation memory

  logging_steps: 2
  save_strategy: "steps"
  save_steps: 500  # multiple of eval_steps (required when load_best_model_at_end is true)
  save_total_limit: 20

  # NOTE(review): recent transformers versions renamed this key to
  # "eval_strategy" — confirm which name the training script expects.
  evaluation_strategy: "steps"
  eval_steps: 100
  load_best_model_at_end: true

  # Early stopping
  early_stopping:
    enabled: true
    patience: 3  # Number of evaluations with no improvement before stopping
    min_delta: 0.001  # Minimum change to qualify as improvement
    metric: "eval_loss"  # Metric to monitor
    mode: "min"  # "min" for loss, "max" for accuracy/etc.

  resume_from_checkpoint: "auto"  # presumably auto-detects the latest checkpoint in run_dir — verify

# Merge the trained LoRA adapter back into the base weights after training.
merge:
  enabled: true
  merged_dtype: "float16"  # NOTE(review): training uses bfloat16; confirm float16 export is intended
  max_shard_size: "2GB"
  output_dir: "./merged_24b_instruct_lora"