Spaces: Runtime error
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# ==============================
# Model configuration
# ==============================
MODEL_NAME = "bigcode/starcoder"  # ~15B parameters; see the memory note after the script
OUTPUT_DIR = "./results"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fix the padding token: StarCoder's tokenizer ships without one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as padding
# If you prefer to add a new PAD token instead:
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# If you added a new token, resize the embeddings to match
# model.resize_token_embeddings(len(tokenizer))

# ==============================
# Prepare the dataset
# ==============================
# Example with wikitext (replace with your own dataset)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small sample

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# ==============================
# DataCollator configuration
# ==============================
# DataCollatorWithPadding only pads the inputs and never builds `labels`,
# so the causal-LM Trainer gets no loss back and crashes. With mlm=False,
# DataCollatorForLanguageModeling pads and also copies input_ids into labels
# (padded positions set to -100). See the note after the script.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ==============================
# Trainer configuration
# ==============================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",  # renamed `eval_strategy` in recent transformers releases
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # evaluating on the training split is a placeholder
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ==============================
# Start training
# ==============================
trainer.train()
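
The collator swap above addresses the most likely cause of the crash: DataCollatorWithPadding returns batches without a `labels` key, and with a causal-LM model the Trainer then raises a ValueError complaining that the model did not return a loss. A minimal sketch of the difference, using gpt2 purely as a small stand-in tokenizer (any causal-LM tokenizer behaves the same way):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling, DataCollatorWithPadding

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # gpt2 also ships without a pad token

features = [tok("def add(a, b):"), tok("    return a + b")]

# Padding-only collator: no labels, so the Trainer has nothing to compute a loss from
print(DataCollatorWithPadding(tokenizer=tok)(features).keys())
# -> input_ids, attention_mask

# Language-modeling collator with mlm=False: labels are a copy of input_ids,
# with padded positions set to -100 so they are ignored by the loss
batch = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)(features)
print(batch.keys())
# -> input_ids, attention_mask, labels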
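
Separately, bigcode/starcoder is a ~15B-parameter checkpoint, so from_pretrained in full precision needs tens of GB of memory; on default Spaces hardware that alone is a plausible source of the "Runtime error" badge, independent of the collator. For a smoke test of the training loop, swapping in a much smaller checkpoint first is cheap (the exact model id below is an assumption for illustration; any small causal LM works):

MODEL_NAME = "bigcode/tiny_starcoder_py"  # hypothetical smoke-test swap, ~160M parameters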