import os

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# -------------------------------
# Configuration
# -------------------------------
MODEL_NAME = "codellama/CodeLlama-7b-hf"  # Base model
LORA_DIR = "lora_codellama"               # Folder where the LoRA adapter is saved
DATASET_PATH = "tu_dataset.json"          # Your local dataset (JSON or JSONL)

# Create the output folder if it does not exist
os.makedirs(LORA_DIR, exist_ok=True)

# -------------------------------
# Load model and tokenizer
# -------------------------------
print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Llama-family tokenizers ship without a pad token; the data collator
# needs one to batch variable-length sequences.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
)

# -------------------------------
# Configure LoRA
# -------------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# -------------------------------
# Load dataset
# -------------------------------
dataset = load_dataset("json", data_files=DATASET_PATH)
dataset = dataset["train"]  # Assumes the JSON contains only the training split

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Drop the raw columns so only the tokenized fields reach the Trainer
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM objective, not masked LM
)

# -------------------------------
# Training
# -------------------------------
training_args = TrainingArguments(
    output_dir=LORA_DIR,
    num_train_epochs=1,  # Adjust to your time budget
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=1,
    logging_steps=50,
    learning_rate=2e-4,
    fp16=True,
    gradient_accumulation_steps=4,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

print("Starting LoRA training...")
trainer.train()

# -------------------------------
# Save LoRA
# -------------------------------
print("Saving LoRA adapter to:", LORA_DIR)
model.save_pretrained(LORA_DIR)
tokenizer.save_pretrained(LORA_DIR)  # keep the tokenizer (and pad token) with the adapter

print("Training complete! Your LoRA adapter is ready for production.")
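
# -------------------------------
# Using the adapter for inference (commented-out sketch)
# -------------------------------
# A minimal sketch of how the saved adapter could be loaded in a separate
# process, using peft's PeftModel API. The prompt below is a hypothetical
# example; swap in your own.
#
#   import torch
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   base = AutoModelForCausalLM.from_pretrained(
#       "codellama/CodeLlama-7b-hf", device_map="auto", torch_dtype=torch.float16
#   )
#   model = PeftModel.from_pretrained(base, "lora_codellama")
#   tokenizer = AutoTokenizer.from_pretrained("lora_codellama")
#
#   prompt = "def fibonacci(n):"
#   inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#   outputs = model.generate(**inputs, max_new_tokens=64)
#   print(tokenizer.decode(outputs[0], skip_special_tokens=True))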