# Load the first 5,000 training examples of the ELI5-Category dataset
from datasets import load_dataset

eli5 = load_dataset("dany0407/eli5_category", split="train[:5000]")

# Split into train (80%) and test (20%) sets
eli5 = eli5.train_test_split(test_size=0.2)
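
# Optional sanity check (assumes the standard ELI5 schema, where `answers`
# is a nested field holding a list of answer strings): peek at one raw answer.
print(eli5["train"][0]["answers"]["text"][0][:200])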

# Load the DistilGPT-2 tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

# Flatten the nested `answers` field so the answer texts are exposed
# as the top-level column `answers.text`
eli5 = eli5.flatten()
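
# Optional: list the flattened column names to confirm `answers.text` exists
print(eli5["train"].column_names)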

def preprocess_function(examples):
    """Join all of an example's answers into one string and tokenize it."""
    return tokenizer(
        [" ".join(x) for x in examples["answers.text"]],
        truncation=True,
        max_length=1024,
    )
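
# Optional sketch: call the function on two raw examples (slicing a Dataset
# yields a dict of lists) to check that tokenization returns `input_ids`.
sample = preprocess_function(eli5["train"][:2])
print(len(sample["input_ids"]), len(sample["input_ids"][0]))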

# Tokenize the whole dataset in parallel batches, dropping the raw columns
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

print("✅ Tokenization complete!")
print(tokenized_eli5)

# Concatenate the tokenized texts and regroup them into fixed-size blocks
block_size = 128

def group_texts(examples):
    """Concatenate all texts and split them into blocks of block_size tokens."""
    # Concatenate every column (input_ids, attention_mask, ...) into one long list
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the remainder so every block has exactly block_size tokens
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Slice each column into chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # For causal LM the labels are the inputs; the model shifts them internally
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)
print(lm_dataset)
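
# Optional check: decode one 128-token block back to text to confirm the grouping
print(tokenizer.decode(lm_dataset["train"][0]["input_ids"]))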

from transformers import DataCollatorForLanguageModeling

# GPT-2 has no pad token, so reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
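
# Optional sketch: collate two examples to see the batched tensors the Trainer
# will receive; with mlm=False the collator copies input_ids into labels and
# masks any padding positions with -100.
batch = data_collator([lm_dataset["train"][i] for i in range(2)])
print(batch["input_ids"].shape, batch["labels"].shape)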

# Load the pretrained DistilGPT-2 model with a causal language-modeling head
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
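
# DistilGPT-2 is small (~82M parameters), which keeps fine-tuning affordable
print(f"Parameters: {model.num_parameters():,}")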

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./gpt2-eli5-finetuned-by-yvens",

    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # Optimization
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,

    # Evaluation and checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Logging
    logging_steps=100,
)

# Assemble the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

# Run fine-tuning
trainer.train()
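
# Optional: report perplexity on the test split (the exponential of the eval loss)
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")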

# Save the fine-tuned model and its tokenizer to the same directory
# (the original saved them to two different paths, which would break reloading)
trainer.save_model("./gpt2-eli5-final-by-Yvens")
tokenizer.save_pretrained("./gpt2-eli5-final-by-Yvens")

print("✅ Training complete!")