An example YAML configuration for LoRA fine-tuning with `mlx_lm.lora`:
```yaml
# The path to the local model directory or Hugging Face repo.
model: "mlx-community/Qwen2.5-7B-Instruct-4bit"
# Whether or not to train (boolean)
train: true
# The fine-tuning method: "lora", "dora", or "full".
fine_tune_type: lora
# The optimizer and its configuration options.
optimizer: adamw
# optimizer_config:
#   adamw:
#     betas: [0.9, 0.98]
#     eps: 1e-6
#     weight_decay: 0.05
#     bias_correction: true
# Directory with {train, valid, test}.jsonl files
# (see the data format example below).
data: "resources/data/mlx_train_data"
# The PRNG seed
seed: 0
# Number of layers to fine-tune
num_layers: 28
# Minibatch size.
batch_size: 2
# Iterations to train for.
iters: 200
# Number of validation batches; -1 uses the entire validation set.
val_batches: 25
# Learning rate.
learning_rate: 1e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 10
# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "resources/model/output/mlx/adapters"
# Save the model every N iterations.
save_every: 100
# Evaluate on the test set after training.
test: false
# Number of test set batches; -1 uses the entire test set.
test_batches: 100
# Maximum sequence length.
max_seq_length: 2048
# Use gradient checkpointing to reduce memory use.
grad_checkpoint: false
# LoRA parameters can only be specified in a config file.
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied to the last num_layers layers.
  keys: ["self_attn.q_proj", "self_attn.v_proj"]
  rank: 32
  scale: 10.0
  dropout: 0.1
  alpha: 64
# A learning-rate schedule can only be specified in a config file;
# uncomment to use (see the sketch after this block).
# lr_schedule:
#   name: cosine_decay
#   warmup: 100  # 0 for no warmup
#   warmup_init: 1e-7  # 0 if not specified
#   arguments: [1e-5, 1000, 1e-7]  # passed to the scheduler
# Use a Hugging Face dataset instead of local JSONL files;
# uncomment to use.
# hf_dataset:
#   name: "billsum"
#   train_split: "train[:1000]"
#   valid_split: "train[-100:]"
#   prompt_feature: "text"
#   completion_feature: "summary"
```
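Once saved (e.g. as `lora_config.yaml`, a filename chosen here for illustration), training can be launched with mlx-lm's LoRA entry point, e.g. `mlx_lm.lora --config lora_config.yaml`; options passed on the command line should take precedence over values in the config file.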
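The `data` directory is expected to contain `train.jsonl` and `valid.jsonl` (plus `test.jsonl` when `test: true`). mlx-lm accepts a few JSONL schemas, including plain `text`, `prompt`/`completion` pairs, and chat-style `messages`. A minimal chat-style line, with illustrative contents:

```jsonl
{"messages": [{"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]}
```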
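The commented `lr_schedule` block corresponds to schedules from `mlx.optimizers`. Below is a minimal sketch of roughly what it builds, assuming an mlx version whose `cosine_decay` accepts an end value: a 100-step linear warmup from `warmup_init` to the peak rate, joined to a cosine decay over 1000 steps down to 1e-7, mirroring `arguments: [1e-5, 1000, 1e-7]`.

```python
import mlx.optimizers as optim

# 100-step linear warmup from warmup_init (1e-7) up to the peak rate (1e-5).
warmup = optim.linear_schedule(1e-7, 1e-5, steps=100)
# Cosine decay from the peak rate over 1000 steps down to 1e-7.
decay = optim.cosine_decay(1e-5, 1000, 1e-7)
# Switch from warmup to decay at step 100.
schedule = optim.join_schedules([warmup, decay], [100])

# Schedules are callables and can be passed directly as the learning rate.
optimizer = optim.AdamW(learning_rate=schedule)
```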