import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
# ==============================
# Model configuration
# ==============================
MODEL_NAME = "bigcode/starcoder"
OUTPUT_DIR = "./results"
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fix the missing padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # use EOS as the padding token
# If you prefer to add a brand-new PAD token instead:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# If you added a new token above, resize the embeddings to match
# model.resize_token_embeddings(len(tokenizer))
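# Note (an assumption, not in the original script): bigcode/starcoder is a
# ~15B-parameter model, so a full-precision load will not fit on most single
# GPUs. A minimal sketch of a lighter-weight load, assuming a CUDA device and
# the `accelerate` package are installed:
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
# )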
# ==============================
# Prepare the dataset
# ==============================
# Example with wikitext (replace with your own dataset)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small sample
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

# Drop the raw "text" column so batches contain only token fields
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)
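# Optional sketch (an assumption, not part of the original flow): hold out a
# small evaluation split instead of evaluating on the training data below.
# The test_size value here is arbitrary.
# split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
# train_split, eval_split = split["train"], split["test"]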
# ==============================
# Data collator configuration
# ==============================
# DataCollatorWithPadding pads but never creates labels, so the Trainer would
# have no loss to optimize. For causal LM fine-tuning, use the language-modeling
# collator with mlm=False: it pads the batch and copies input_ids into labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
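# Optional sanity check (a sketch, not in the original script): collate two
# examples and confirm the batch carries input_ids, attention_mask, and labels.
# batch = data_collator([tokenized_dataset[i] for i in range(2)])
# print({k: v.shape for k, v in batch.items()})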
# ==============================
# Trainer configuration
# ==============================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",  # renamed to eval_strategy in recent transformers releases
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
)
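# Optional additions (assumptions; they require a CUDA GPU): mixed precision
# and gradient accumulation make a model this size somewhat more tractable,
# e.g. by also passing fp16=True and gradient_accumulation_steps=8 above.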
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # evaluates on the training data; see the optional split above
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# ==============================
# Start training
# ==============================
trainer.train()
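
# Persist the fine-tuned weights and tokenizer (a minimal sketch; reusing the
# script's OUTPUT_DIR here is an assumption about where you want them saved):
# trainer.save_model(OUTPUT_DIR)
# tokenizer.save_pretrained(OUTPUT_DIR)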