# The model to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"
# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"
# Fine-tuned model name
new_model = "llama-2-7b-miniguanaco"
################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 64
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1
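# Illustration (not part of the original parameter listing): a minimal sketch of how
# these three values are typically passed to a PEFT LoraConfig. The bias and
# task_type settings shown here are assumptions, not parameters defined above.
from peft import LoraConfig

peft_config = LoraConfig(
    r=lora_r,                   # LoRA attention dimension
    lora_alpha=lora_alpha,      # scaling factor
    lora_dropout=lora_dropout,  # dropout on the LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
)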
################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False
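# Illustration (not part of the original parameter listing): a sketch of how the flags
# above map onto a transformers BitsAndBytesConfig for 4-bit (QLoRA-style) loading.
import torch
from transformers import BitsAndBytesConfig

# Resolve the compute dtype string ("float16") to the actual torch dtype
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)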
################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training
fp16 = False
bf16 = False
# Batch size per GPU for training
per_device_train_batch_size = 1
# Batch size per GPU for evaluation
per_device_eval_batch_size = 1
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "cosine"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with the same length
group_by_length = True
# Save checkpoint every X update steps
save_steps = 0
# Log every X update steps
logging_steps = 25
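# Illustration (not part of the original parameter listing): a sketch of how the values
# above would be collected into a transformers TrainingArguments object.
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)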
################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use (with None, SFTTrainer warns and defaults to 1024)
max_seq_length = None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = False
# Device map for loading the model ("auto" lets Accelerate place layers on the available devices)
device_map = "auto"
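# Illustration (not part of the original parameter listing): a sketch of how the pieces
# above come together — load the dataset and the 4-bit base model, then fine-tune with
# TRL's SFTTrainer. This assumes an older TRL API in which SFTTrainer accepts
# dataset_text_field, max_seq_length, tokenizer and packing directly (newer TRL
# versions move these into SFTConfig); the "text" column name is also an assumption.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer

# Load the instruction dataset
dataset = load_dataset(dataset_name, split="train")

# Load the base model in 4-bit precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False

# Load the tokenizer and set up padding
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Set up and run supervised fine-tuning
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()

# Save the fine-tuned LoRA adapter under the new model name
trainer.model.save_pretrained(new_model)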