Spaces:
Sleeping
Sleeping
{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true,
    "dataloader_num_workers": 4
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "attn_implementation": "eager",
    "use_flash_attention": false,
    "memory_optimization": {
      "expandable_segments": true
    }
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "max_tokens": 2048,
    "text_field": "conversations",
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  },
  "deepspeed_config": {
    "zero_optimization": {
      "stage": 2,
      "offload_optimizer": {
        "device": "cpu",
        "pin_memory": true
      },
      "contiguous_gradients": true,
      "overlap_comm": true,
      "reduce_scatter": true,
      "reduce_bucket_size": 5e8,
      "allgather_bucket_size": 5e8
    },
    "gradient_accumulation_steps": 4,
    "gradient_clipping": 0.3,
    "fp16": {
      "enabled": true,
      "loss_scale": 0,
      "loss_scale_window": 1000,
      "initial_scale_power": 16,
      "hysteresis": 2,
      "min_loss_scale": 1
    },
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": 2e-5,
        "betas": [0.9, 0.999],
        "eps": 1e-8,
        "weight_decay": 0.01
      }
    },
    "activation_checkpointing": {
      "partition_activations": true,
      "cpu_checkpointing": true,
      "contiguous_memory_optimization": true,
      "number_checkpoints": null,
      "synchronize_checkpoint_boundary": false,
      "profile": false
    },
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
  }
}