| data: | |
| collator: | |
| pad_to_multiple_of: 8 | |
| dataloader: | |
| drop_last: true | |
| num_workers: 4 | |
| pin_memory: true | |
| shuffle: true | |
| processed_dir: finetune_processed_experiences | |
| fsdp: | |
| activation_checkpointing: true | |
| mixed_precision: true | |
| sharding_strategy: FULL_SHARD | |
| gpu: | |
| data_parallel: true | |
| single_gpu: false | |
| huggingface: | |
| create_model_card: true | |
| repo_name: dtadpole/KernelCoder-4B_20250621-071556 | |
| upload: true | |
| lora: | |
| alpha: 64 | |
| bias: none | |
| dropout: 0.05 | |
| r: 64 | |
| target_modules: | |
| - q_proj | |
| - k_proj | |
| - v_proj | |
| - o_proj | |
| - gate_proj | |
| - down_proj | |
| - up_proj | |
| model: | |
| dtype: null | |
| load_in_4bit: true | |
| max_seq_length: 16384 | |
| name: Qwen/Qwen3-4B | |
| test: | |
| default_prompt: '<\|im_start\|>system | |
| You are a helpful assistant.<\|im_end\|> | |
| <\|im_start\|>user | |
| What is machine learning?<\|im_end\|> | |
| <\|im_start\|>assistant | |
| ' | |
| generation: | |
| do_sample: true | |
| max_new_tokens: 1024 | |
| temperature: 0.7 | |
| use_cache: true | |
| training: | |
| gradient_accumulation_steps: 1 | |
| learning_rate: 5.0e-05 | |
| logging_steps: 1 | |
| lr_scheduler_type: cosine | |
| max_grad_norm: 0.75 | |
| max_steps: -1 | |
| num_train_epochs: 2 | |
| num_workers: 4 | |
| optim: paged_adamw_8bit | |
| output_dir: ../finetune_model_output | |
| per_device_batch_size: 1 | |
| save_steps: 100 | |
| save_total_limit: 3 | |
| seed: 3407 | |
| use_custom_loss_masking: true | |
| warmup_steps: 10 | |
| weight_decay: 0.05 | |