# NOTE(review): the original paste carried stray "Spaces:" / "Runtime error"
# markers here (copy artifacts, not code); replaced with this comment so the
# file parses as Python. The runtime error itself is fixed below (see the
# `remove_columns` note at the tokenization step).
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# -------------------------------
# Configuration
# -------------------------------
MODEL_NAME = "codellama/CodeLlama-7b-hf"  # Base model
LORA_DIR = "lora_codellama"               # Directory where the LoRA adapter is saved
DATASET_PATH = "tu_dataset.json"          # Local dataset (JSON)

# Create the output directory if it does not exist
os.makedirs(LORA_DIR, exist_ok=True)

# -------------------------------
# Load model and tokenizer
# -------------------------------
print("Cargando modelo base...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Llama-family tokenizers ship without a pad token, but the LM data collator
# below must pad variable-length sequences — without this it raises at the
# first batch. Reusing EOS as PAD is the standard causal-LM workaround.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",          # Shard/place layers automatically across devices
    torch_dtype=torch.float16,  # Half precision to fit a 7B model in GPU memory
)
# -------------------------------
# Configure LoRA
# -------------------------------
# Wrap the base model with low-rank adapters on the attention projections;
# only the adapter weights will be trained.
model = get_peft_model(
    model,
    LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    ),
)

# -------------------------------
# Load dataset
# -------------------------------
# `load_dataset("json", ...)` returns a DatasetDict; keep its "train" split.
dataset = load_dataset("json", data_files=DATASET_PATH)["train"]
print("Columnas del dataset:", dataset.column_names)
# -------------------------------
# Tokenization function
# -------------------------------
def tokenize_function(examples):
    """Tokenize a batch, auto-detecting the dataset schema.

    Supports either a ("prompt", "completion") column pair — joined with a
    newline — or a single "text" column.

    Args:
        examples: a batch dict of column name -> list of values, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's encoding dict (input_ids, attention_mask),
        truncated to 512 tokens.

    Raises:
        ValueError: if neither supported schema is present.
    """
    # Inspect the batch itself rather than the module-level `dataset`
    # global, so the function works with any dataset it is mapped over.
    columns = list(examples.keys())
    if "prompt" in columns and "completion" in columns:
        texts = [p + "\n" + c for p, c in zip(examples["prompt"], examples["completion"])]
    elif "text" in columns:
        texts = examples["text"]
    else:
        # Original message was mojibake ("inv谩lidas"); restored proper UTF-8.
        raise ValueError(f"Columnas inválidas en dataset: {columns}")
    return tokenizer(texts, truncation=True, max_length=512)
# Drop the original string columns after tokenizing: the data collator can
# only batch tensor-convertible fields, and leftover str columns
# (prompt/completion/text) make the Trainer raise at the first batch —
# this was the script's runtime error.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Causal-LM collation: pads each batch and copies input_ids into labels
# (mlm=False disables BERT-style masked-token objectives).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
# -------------------------------
# Training
# -------------------------------
training_args = TrainingArguments(
    output_dir=LORA_DIR,
    num_train_epochs=1,             # Adjust to your time budget and GPU
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=1,             # Keep only the most recent checkpoint
    logging_steps=50,
    learning_rate=2e-4,
    fp16=True,                      # Mixed precision; matches the fp16 base weights
    gradient_accumulation_steps=4,  # Effective batch size = 1 * 4
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

print("Comenzando entrenamiento de LoRA...")
trainer.train()

# -------------------------------
# Save LoRA
# -------------------------------
print("Guardando LoRA en la carpeta:", LORA_DIR)
# For a PEFT-wrapped model this writes only the adapter weights, not the
# full 7B base model.
model.save_pretrained(LORA_DIR)
# Also save the tokenizer (including the pad-token setting) next to the
# adapter so inference can load everything from one directory.
tokenizer.save_pretrained(LORA_DIR)
# Restored proper UTF-8 in the final message (was mojibake: "隆...producci贸n").
print("¡Entrenamiento completado! LoRA lista para producción.")