Spaces:
Sleeping
Sleeping
{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "attn_implementation": "eager"
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "max_tokens": 2048,
    "text_field": "conversations",
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  }
}