File size: 1,478 Bytes
bc19ef1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def train_lora(epochs, batch_size, learning_rate):
    """Fine-tune ``base_model`` on the JSON dataset and save it to ``LORA_PATH``.

    Loads the JSON file(s) at ``DATASET_PATH``, tokenizes each record's
    ``"prompt"`` and ``"completion"`` fields as one concatenated sequence
    (truncated/padded to 256 tokens), runs a ``Trainer`` over the ``"train"``
    split, and then saves both the model and the tokenizer to ``LORA_PATH``.

    Relies on the module-level names ``base_model``, ``tokenizer``,
    ``DATASET_PATH`` and ``LORA_PATH`` being defined elsewhere in the file.

    Args:
        epochs: Number of training epochs; coerced with ``int()``.
        batch_size: Per-device training batch size; coerced with ``int()``.
        learning_rate: Optimizer learning rate; coerced with ``float()``.

    Returns:
        A human-readable status string: a success message, or an error
        message embedding the caught exception's text. Any ``Exception``
        raised during the run is converted into that error string instead
        of propagating to the caller.
    """
    try:
        # Coerce every hyperparameter up front so a malformed value (e.g. a
        # non-numeric string from a UI widget) fails before the dataset is
        # loaded and tokenized, and so learning_rate is handled the same way
        # as the two integer settings.
        num_epochs = int(epochs)
        per_device_batch = int(batch_size)
        lr = float(learning_rate)

        dataset = load_dataset("json", data_files=DATASET_PATH)

        # Tokenize prompt and completion together as one training sequence.
        def tokenize_fn(example):
            return tokenizer(
                example["prompt"] + example["completion"],
                truncation=True,
                padding="max_length",
                max_length=256,
            )

        tokenized = dataset.map(tokenize_fn, batched=False)

        # Expose only the tokenizer outputs, as torch tensors.
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

        # mlm=False selects the non-masked (causal) language-modeling collator.
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        training_args = TrainingArguments(
            output_dir=LORA_PATH,
            per_device_train_batch_size=per_device_batch,
            num_train_epochs=num_epochs,
            learning_rate=lr,
            save_total_limit=1,
            logging_steps=10,
            push_to_hub=False,
        )

        trainer = Trainer(
            model=base_model,
            args=training_args,
            train_dataset=tokenized["train"],
            data_collator=data_collator,
        )

        trainer.train()
        base_model.save_pretrained(LORA_PATH)
        tokenizer.save_pretrained(LORA_PATH)

        # NOTE(review): the message hard-codes ./lora_output while the save
        # target is LORA_PATH — confirm the two always match.
        return "✅ Entrenamiento completado y guardado en ./lora_output"
    except Exception as e:
        # Deliberate boundary: the caller receives a status string either way.
        return f"❌ Error durante el entrenamiento: {e}"