import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
# -------------------------------
# Configuration
# -------------------------------
MODEL_NAME = "codellama/CodeLlama-7b-hf"  # Base model
LORA_DIR = "lora_codellama"               # Directory where the LoRA adapter will be saved
DATASET_PATH = "tu_dataset.json"          # Your local dataset (JSON or JSONL)

# Create the output directory if it does not exist
os.makedirs(LORA_DIR, exist_ok=True)
# -------------------------------
# Load model and tokenizer
# -------------------------------
print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# CodeLlama's tokenizer has no pad token by default; the data collator needs one for padding
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
)
# -------------------------------
# Configure LoRA
# -------------------------------
lora_config = LoraConfig(
    r=16,                                 # Rank of the low-rank update matrices
    lora_alpha=32,                        # Scaling factor for the LoRA updates
    target_modules=["q_proj", "v_proj"],  # Attention projections to adapt
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
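# Report how few parameters LoRA actually trains relative to the full model
model.print_trainable_parameters()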
# -------------------------------
# Load dataset
# -------------------------------
dataset = load_dataset("json", data_files=DATASET_PATH)
dataset = dataset["train"]  # Local JSON files are loaded under the "train" split

def tokenize_function(examples):
    # Each record is expected to carry the raw training sample in a "text" field
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
# mlm=False selects the standard causal-LM objective (labels are the shifted input ids)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
# -------------------------------
# Training
# -------------------------------
training_args = TrainingArguments(
    output_dir=LORA_DIR,
    num_train_epochs=1,             # Adjust to your time budget
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=1,
    logging_steps=50,
    learning_rate=2e-4,
    fp16=True,
    gradient_accumulation_steps=4,  # Effective batch size of 4
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)
print("Comenzando entrenamiento de LoRA...")
trainer.train()
# -------------------------------
# Save LoRA
# -------------------------------
print("Saving LoRA adapter to:", LORA_DIR)
model.save_pretrained(LORA_DIR)
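# Also save the tokenizer so the adapter directory is self-contained
tokenizer.save_pretrained(LORA_DIR)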
print("隆Entrenamiento completado! Ahora tu LoRA est谩 lista para producci贸n.")
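# -------------------------------
# Minimal inference sketch (left commented out so it does not run as part of
# training; assumes the MODEL_NAME and LORA_DIR defined above): reload the
# fp16 base model and attach the saved adapter with PeftModel.
# -------------------------------
# from peft import PeftModel
# base_model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME, device_map="auto", torch_dtype=torch.float16
# )
# inference_model = PeftModel.from_pretrained(base_model, LORA_DIR)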