from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
# ==============================
# Model configuration
# ==============================
MODEL_NAME = "bigcode/starcoder"
OUTPUT_DIR = "./results"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fix the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # use EOS as the padding token
# If you prefer to add a new PAD token instead:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# If you added a new token, resize the embeddings to match
# model.resize_token_embeddings(len(tokenizer))
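# Optional (an assumption, not in the original script): bigcode/starcoder has
# ~15B parameters, so full-precision training rarely fits on a single GPU.
# Loading in half precision is a common workaround (requires `import torch`):
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)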
# ==============================
# Dataset preparation
# ==============================
# Example with wikitext (replace with your own dataset)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small sample

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
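# Optional sketch: wikitext-2-raw-v1 contains many empty lines, which tokenize
# to zero-length examples; filtering them out keeps batches meaningful.
# tokenized_dataset = tokenized_dataset.filter(lambda ex: len(ex["input_ids"]) > 0)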
# ==============================
# DataCollator configuration
# ==============================
# DataCollatorWithPadding only pads the inputs and never creates labels, so the
# model returns no loss and trainer.train() fails at runtime. For causal LM
# fine-tuning, use DataCollatorForLanguageModeling with mlm=False: it pads the
# batch and copies input_ids into labels (padding positions become -100).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# ==============================
# Trainer configuration
# ==============================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",  # note: renamed to eval_strategy in recent transformers releases
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # reusing the same small split for eval, for demo purposes
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# ==============================
# Start training
# ==============================
trainer.train()
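
# Persist the fine-tuned weights and tokenizer (a minimal follow-up sketch;
# OUTPUT_DIR is reused here, but any path works).
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)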