yotsubian
/

qwen

+model:  # Which base model to load and how to load it.
+  name: 'Qwen/Qwen3-4B-Instruct-2507'  # presumably a Hugging Face Hub id — confirm against the loader
+  torch_dtype: 'bfloat16'  # passed as a string; the consumer must map it to a dtype
+  trust_remote_code: false
+lora:  # Adapter settings; key names look like PEFT LoraConfig fields — TODO confirm consumer
+  r: 16
+  lora_alpha: 32  # twice the value of r above
+  target_modules:  # same four entries, same order, written as a block list
+    - 'q_proj'
+    - 'v_proj'
+    - 'k_proj'
+    - 'o_proj'
+  lora_dropout: 0.05
+  bias: 'none'  # the literal string none, kept quoted
+training:  # Optimisation, logging and checkpoint settings for the fine-tuning run.
+  num_epochs: 3
+  per_device_batch_size: 2
+  gradient_accumulation_steps: 3  # with batch size 2: presumably 6 samples per device per step — TODO confirm
+  learning_rate: 0.0002
+  lr_scheduler: 'cosine'
+  warmup_ratio: 0.05
+  bf16: true  # model.torch_dtype in this file is also bfloat16
+  gradient_checkpointing: false
+  logging_steps: 20
+  save_strategy: 'epoch'
+  save_total_limit: 3  # same number as num_epochs
+  max_length: 1024  # unit not shown here; presumably tokens — TODO confirm
+generation:  # Parameters used when generating outputs from the model.
+  max_new_tokens: 512
+  temperature: 1.0
+  n_samples: 32  # presumably samples drawn per prompt — TODO confirm against the caller
+paths:  # Directory locations; all three are relative and end with a slash.
+  checkpoint_dir: 'checkpoints/'
+  results_dir: 'results/'
+  data_dir: 'data/'  # NOTE(review): what these resolve against (cwd vs. repo root) is not shown here
+lean_server:  # Settings for the service reached at `url`.
+  url: 'http://localhost:8000'  # plain HTTP on the local machine, port 8000
+  timeout: 60  # unit not stated here; presumably seconds — TODO confirm
+  max_workers: 32  # NOTE(review): whether this caps client-side or server-side workers is not shown