import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# ==============================
# Model configuration
# ==============================
MODEL_NAME = "bigcode/starcoder"
OUTPUT_DIR = "./results"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fix the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # use EOS as padding
    # If you prefer to add a new PAD token instead:
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# If you added a new token, resize the embeddings:
# model.resize_token_embeddings(len(tokenizer))

# ==============================
# Prepare the dataset
# ==============================
# Example with wikitext (replace with your dataset)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small sample
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)  # drop empty lines

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# ==============================
# DataCollator configuration
# ==============================
# DataCollatorForLanguageModeling with mlm=False pads each batch and builds the
# "labels" needed for causal LM training. (DataCollatorWithPadding does not add
# labels, so the Trainer would have no loss to optimize.)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ==============================
# Trainer configuration
# ==============================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # demo only: reuses the train split for evaluation
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ==============================
# Start training
# ==============================
trainer.train()