import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# -------------------------------
# Configuration
# -------------------------------
MODEL_NAME = "codellama/CodeLlama-7b-hf"   # Base model
LORA_DIR = "lora_codellama"               # Folder where the LoRA adapter will be saved
DATASET_PATH = "tu_dataset.json"          # Your local dataset (JSONL or JSON)
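# Each record is expected to expose a "text" field, e.g.
#   {"text": "### Instruction: ...\n### Response: ..."}
# (the prompt format above is only an illustration; the tokenization step below
# only requires the "text" key).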

# Create the output folder if it does not exist
os.makedirs(LORA_DIR, exist_ok=True)

# -------------------------------
# Load model and tokenizer
# -------------------------------
print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# CodeLlama's tokenizer has no pad token by default, and the data collator needs
# one for padding, so reuse the EOS token.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16
)

# -------------------------------
# Configure LoRA
# -------------------------------
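# r is the adapter rank and lora_alpha its scaling factor; restricting
# target_modules to q_proj/v_proj applies LoRA only to the attention
# query/value projections (a common, lightweight default).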
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
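# Optional: report how many parameters LoRA actually trains, as a quick sanity
# check that only the adapter weights are trainable.
model.print_trainable_parameters()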

# -------------------------------
# Load dataset
# -------------------------------
dataset = load_dataset("json", data_files=DATASET_PATH)
dataset = dataset["train"]  # Assumes the JSON contains only the training split

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
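# With mlm=False the collator performs standard causal-LM batching: labels are a
# copy of input_ids (padding positions masked out), so the dataset needs no
# separate label column.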

# -------------------------------
# Training
# -------------------------------
training_args = TrainingArguments(
    output_dir=LORA_DIR,
    num_train_epochs=1,          # Adjust to the time you have
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=1,
    logging_steps=50,
    learning_rate=2e-4,
    fp16=True,
    gradient_accumulation_steps=4,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)

print("Starting LoRA training...")
trainer.train()

# -------------------------------
# Save LoRA
# -------------------------------
print("Saving LoRA adapter to:", LORA_DIR)
model.save_pretrained(LORA_DIR)
print("Training complete! Your LoRA adapter is now ready for production.")
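
# A minimal sketch of how the saved adapter could later be loaded for inference
# (in a separate script; assumes the same MODEL_NAME base model and peft's
# PeftModel API; the prompt below is only a placeholder):
#
#   from peft import PeftModel
#   tok = AutoTokenizer.from_pretrained(MODEL_NAME)
#   base = AutoModelForCausalLM.from_pretrained(
#       MODEL_NAME, device_map="auto", torch_dtype=torch.float16
#   )
#   lora_model = PeftModel.from_pretrained(base, LORA_DIR)
#   lora_model.eval()
#   prompt = "def fibonacci(n):"
#   inputs = tok(prompt, return_tensors="pt").to(lora_model.device)
#   output = lora_model.generate(**inputs, max_new_tokens=64)
#   print(tok.decode(output[0], skip_special_tokens=True))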