Spaces:
Runtime error
Runtime error
File size: 3,012 Bytes
7b02281 a7bd5fa 7b02281 a7bd5fa 7b02281 37edf61 a7bd5fa 7b02281 a7bd5fa 7b02281 a7bd5fa 7b02281 37edf61 7b02281 37edf61 7b02281 37edf61 7b02281 a7bd5fa 7b02281 a7bd5fa 7b02281 37edf61 7b02281 a7bd5fa 7b02281 37edf61 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
# -------------------------------
# Configuration
# -------------------------------
MODEL_NAME = "codellama/CodeLlama-7b-hf" # base model to fine-tune
LORA_DIR = "lora_codellama" # output folder for the LoRA adapter
DATASET_PATH = "tu_dataset.json" # local training dataset (JSON)
# Create the output folder up front if it does not exist yet
os.makedirs(LORA_DIR, exist_ok=True)
# -------------------------------
# Load base model and tokenizer
# -------------------------------
print("Cargando modelo base...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half precision so the 7B model fits in GPU memory
    device_map="auto",          # let accelerate place layers across available devices
)
# -------------------------------
# LoRA configuration
# -------------------------------
# Low-rank adapters on the attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,                                # rank of the low-rank update matrices
    lora_alpha=32,                       # scaling factor (alpha / r = 2)
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",                         # keep all bias terms frozen
)
model = get_peft_model(model, lora_config)
# -------------------------------
# Load the training dataset
# -------------------------------
raw_datasets = load_dataset("json", data_files=DATASET_PATH)
dataset = raw_datasets["train"]
print("Columnas del dataset:", dataset.column_names)
# -------------------------------
# Funci贸n de tokenizaci贸n
# -------------------------------
def tokenize_function(examples):
    """Tokenize one batch of examples, auto-detecting the dataset schema.

    Accepts either ``prompt``/``completion`` pairs (joined with a newline)
    or a single ``text`` column. Raises ``ValueError`` for any other schema.
    """
    # Inspect the batch itself rather than the module-level `dataset`,
    # so the function carries no hidden global dependency.
    if "prompt" in examples and "completion" in examples:
        texts = [p + "\n" + c for p, c in zip(examples["prompt"], examples["completion"])]
    elif "text" in examples:
        texts = examples["text"]
    else:
        # Unknown schema: fail loudly with the columns we actually received.
        columns = list(examples.keys())
        raise ValueError(f"Columnas inv谩lidas en dataset: {columns}")
    return tokenizer(texts, truncation=True, max_length=512)
# CodeLlama's tokenizer ships without a padding token, and the collator
# pads every batch — without this fallback training crashes at the first
# batch ("Asking to pad but the tokenizer does not have a padding token").
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the whole split; drop the raw text columns so only model
# inputs (input_ids / attention_mask) reach the collator.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)
# Causal-LM collator: pads batches and derives labels from input_ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
# -------------------------------
# Training setup
# -------------------------------
training_args = TrainingArguments(
    output_dir=LORA_DIR,
    learning_rate=2e-4,
    num_train_epochs=1,             # raise if time/GPU budget allows
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4
    fp16=True,                      # mixed precision, matches the fp16 model
    logging_steps=50,
    save_steps=500,
    save_total_limit=1,             # keep only the newest checkpoint
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)
print("Comenzando entrenamiento de LoRA...")
trainer.train()
# -------------------------------
# Save the LoRA adapter
# -------------------------------
print("Guardando LoRA en la carpeta:", LORA_DIR)
# Saves only the adapter weights, not the full base model.
model.save_pretrained(LORA_DIR)
print("隆Entrenamiento completado! LoRA lista para producci贸n.")