{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "attn_implementation": "eager"
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "max_tokens": 2048,
    "text_field": "conversations",
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  }
}