# The model to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"
# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"
# Fine-tuned model name
new_model = "llama-2-7b-miniguanaco"
################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 64
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1
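# Illustration (not part of the original parameter listing): a minimal sketch of how
# these three values are typically passed to a PEFT LoraConfig. The bias and
# task_type settings shown here are assumptions, not parameters defined above.
from peft import LoraConfig

peft_config = LoraConfig(
    r=lora_r,                   # LoRA attention dimension
    lora_alpha=lora_alpha,      # scaling factor
    lora_dropout=lora_dropout,  # dropout on the LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
)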
################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False
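# Illustration (not part of the original parameter listing): a sketch of how the flags
# above map onto a transformers BitsAndBytesConfig for 4-bit (QLoRA-style) loading.
import torch
from transformers import BitsAndBytesConfig

# Resolve the compute dtype string ("float16") to the actual torch dtype
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)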
################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training
fp16 = False
bf16 = False
# Batch size per GPU for training
per_device_train_batch_size = 1
# Batch size per GPU for evaluation
per_device_eval_batch_size = 1
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "cosine"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with the same length
group_by_length = True
# Save checkpoint every X update steps
save_steps = 0
# Log every X update steps
logging_steps = 25
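# Illustration (not part of the original parameter listing): a sketch of how the values
# above would be collected into a transformers TrainingArguments object.
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)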
################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use (with None, SFTTrainer warns and defaults to 1024)
max_seq_length = None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = False
# Device map for loading the model ("auto" lets Accelerate place layers on the available devices)
device_map = "auto"
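# Illustration (not part of the original parameter listing): a sketch of how the pieces
# above come together — load the dataset and the 4-bit base model, then fine-tune with
# TRL's SFTTrainer. This assumes an older TRL API in which SFTTrainer accepts
# dataset_text_field, max_seq_length, tokenizer and packing directly (newer TRL
# versions move these into SFTConfig); the "text" column name is also an assumption.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer

# Load the instruction dataset
dataset = load_dataset(dataset_name, split="train")

# Load the base model in 4-bit precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False

# Load the tokenizer and set up padding
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Set up and run supervised fine-tuning
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()

# Save the fine-tuned LoRA adapter under the new model name
trainer.model.save_pretrained(new_model)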