import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# ==============================
# Model configuration
# ==============================
MODEL_NAME = "bigcode/starcoder"
OUTPUT_DIR = "./results"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fix the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # use EOS as the padding token
    # If you prefer to add a new PAD token instead:
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# If you added a new token above, resize the embeddings accordingly
# model.resize_token_embeddings(len(tokenizer))

# ==============================
# Prepare the dataset
# ==============================
# Example with wikitext (replace with your own dataset)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small subset for a quick run

def tokenize_function(examples):
    # Truncate to a fixed maximum length so dynamic padding stays manageable
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# ==============================
# Data collator configuration
# ==============================
# For causal language modeling, DataCollatorForLanguageModeling with mlm=False
# pads each batch and builds the labels the model needs to compute a loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ==============================
# Trainer configuration
# ==============================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # reuse the training split for evaluation in this small example
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ==============================
# Start training
# ==============================
trainer.train()
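
# ==============================
# Save the result (optional)
# ==============================
# Minimal sketch of persisting the fine-tuned weights and tokenizer; saving to
# OUTPUT_DIR is an assumption here, adjust the path to your own layout.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)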