from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

BASE_MODEL = "meta-llama/Llama-2-7b-hf"
DATASET = "walter-taya/code-dataset"
OUTPUT = "./output"

# Llama has no pad token by default, so reuse the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in 8-bit so the 7B weights fit in GPU memory.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    device_map="auto",
)
# Prepare the quantized model for training (casts layer norms, enables input grads).
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/value projections.
lora = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora)
model.print_trainable_parameters()

ds = load_dataset(DATASET)

def tokenize(x):
    # Causal LM objective: labels are a copy of the input ids.
    out = tokenizer(
        x["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    out["labels"] = out["input_ids"].copy()
    return out

ds = ds["train"].shuffle().map(tokenize, remove_columns=["text"])

args = TrainingArguments(
    output_dir=OUTPUT,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8
    num_train_epochs=1,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id="walter-taya/llama2-code-lora",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
)

trainer.train()
trainer.push_to_hub()