---
# SFT (instruction-tuning) run configuration for a 24B model with LoRA adapters,
# starting from a merged CPT checkpoint. Reconstructed into valid block YAML
# from a whitespace-mangled source; section nesting marked NOTE(review) below
# could not be recovered with certainty and should be checked against the loader.

run:
  run_dir: "./runs/instruct_run_24b"
  seed: 42

# WandB integration for experiment tracking
# NOTE(review): placed at top level; the mangled source did not preserve whether
# this was nested under `run` — confirm against the config loader's schema.
wandb:
  enabled: true  # Set to true to enable wandb logging
  project: "sft-training"  # WandB project name
  entity: null  # WandB entity/team (optional)
  name: null  # Run name (optional, will auto-generate if null)
  tags: ["sft-lora", "24b-Devstral"]  # List of tags for the run (e.g., ["lora", "qlora", "experiment-1"])
  notes: null  # Run description/notes (optional)

model:
  # Local merged CPT checkpoint (a filesystem path, not an HF repo id).
  # (Previous comment said "Qwen2.5-Coder-14B", which contradicted this path.)
  repo_id: "./CPT/runs/cpt_run_v1/merged_24b_cpt_lora"
  revision: null  # Used only when repo_id is a HF repo (not a local path)
  base_local_dir: "base_model"
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: "auto"
  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"

  # QLoRA (4-bit quantization; the bnb_* keys only take effect when use_4bit: true)
  use_4bit: false
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: "bfloat16"

  # optional: "flash_attention_2" | "sdpa" | null
  attn_implementation: null

data:
  train_jsonl: "../sft_dataset.jsonl"
  eval_jsonl: null
  eval_split_ratio: 0.1

  # Field names in your JSONL data
  instruction_field: "instruction"  # This will be the system prompt
  input_field: "input"  # This is the task description
  output_field: "output"  # This is the analysis + selection

  # Formatting options
  format_type: "custom"  # "chatml" | "alpaca" | "custom"

  # For chatml format
  system_prompt: |
    You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.

    ## Output Format

    ##OUTPUT
    Explain the data flow and why each component must change:
    - Flow: [Input → Processing → Output with arrows]
    - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
    - Explain coupling between components

    ##SELECT
    modify::crates/path/to/file.rs::impl::ComponentName
    add::crates/another/file.rs::function::AnotherComponent

    ## Rules
    1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
    2. Use `::` for nested items: `status::StructName::Type::Name`
    3. Always explain "must change because" and "without this"
    4. Types of components: function, struct, enum, impl, trait
    5. If there is extra information (e.g., enum variants), include that too.
    6. Start with ##OUTPUT, end with ##SELECT, terminate with ##

    ## Example
    ##TASK
    Add webhook subscription support

    ##OUTPUT
    The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.

    ##SELECT
    crates/common_enums/src/enums.rs::EventClass
    crates/common_enums/src/transformers.rs::SubscriptionStatus

  # For custom format (only used when format_type="custom")
  custom_template: "##INSTRUCTION\n{instruction}<|im_end|>\n##TASK\n{input}<|im_end|>\n##OUTPUT\n{output}<|im_end|>"

  max_length: 2048
  shuffle: true
  num_proc: 4

peft:
  enabled: true
  r: 8
  lora_alpha: 16  # conventionally 2*r
  lora_dropout: 0.05
  bias: "none"
  target_modules: "auto"

train:
  # max_steps: 10
  num_train_epochs: 6
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8
  # Written with an explicit decimal point: YAML 1.1 resolvers (e.g. PyYAML)
  # treat bare "1e-4" as a *string*, not a float.
  learning_rate: 1.0e-4
  weight_decay: 0.0
  warmup_ratio: 0.08
  lr_scheduler_type: "cosine"
  optim: "adamw_torch"  # Changed from paged_adamw_8bit (requires use_4bit=true)
  max_grad_norm: 0.8
  gradient_checkpointing: true
  logging_steps: 2
  save_strategy: "steps"
  save_steps: 500
  save_total_limit: 20
  evaluation_strategy: "steps"
  eval_steps: 100
  load_best_model_at_end: true  # requires save/eval strategies and steps to be compatible

  # Early stopping
  early_stopping:
    enabled: true
    patience: 3  # Number of evaluations with no improvement before stopping
    min_delta: 0.001  # Minimum change to qualify as improvement
    metric: "eval_loss"  # Metric to monitor
    mode: "min"  # "min" for loss, "max" for accuracy/etc.

  # NOTE(review): placed under `train`; nesting was ambiguous in the mangled source.
  resume_from_checkpoint: "auto"

merge:
  enabled: true
  merged_dtype: "float16"  # NOTE(review): training uses bfloat16 — confirm float16 export is intended
  max_shard_size: "2GB"
  output_dir: "./merged_24b_instruct_lora"