from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a pad token; reuse EOS so padding=True below works.
tokenizer.pad_token = tokenizer.eos_token
|
|
dataset = load_dataset("Andyrasika/Ecommerce_FAQ")
|
|
def tokenize_function(examples):
    # Format each question/answer pair as a single "Q: ... A: ..." string.
    text_pairs = [f"Q: {q} A: {a}" for q, a in zip(examples['question'], examples['answer'])]
    return tokenizer(text_pairs, truncation=True, padding=True)
|
|
|
|
|
|
# Tokenize every split of the DatasetDict in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
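# Optional sanity check (not in the original): decode one tokenized example
# to confirm the "Q: ... A: ..." formatting survived tokenization.
print(tokenizer.decode(tokenized_dataset['train'][0]['input_ids'][:40]))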
|
|
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
|
|
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)
|
|
|
|
# The causal-LM collator copies input_ids into labels (mlm=False) so the
# Trainer has a loss to optimize.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],  # pass the split, not the whole DatasetDict
    data_collator=data_collator,
)
|
|
|
|
trainer.train()
|
|
|
|
model.save_pretrained('./faqmodel')
tokenizer.save_pretrained('./faqmodel')
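# Inference sketch (an illustration, not from the original): reload the saved
# model and prompt it with the same "Q: ... A:" format used during training.
# The example question is a hypothetical placeholder.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

faq_tokenizer = GPT2Tokenizer.from_pretrained('./faqmodel')
faq_model = GPT2LMHeadModel.from_pretrained('./faqmodel')

prompt = "Q: How can I track my order? A:"
inputs = faq_tokenizer(prompt, return_tensors='pt')
outputs = faq_model.generate(
    **inputs,
    max_new_tokens=60,
    do_sample=True,
    top_p=0.9,
    pad_token_id=faq_tokenizer.eos_token_id,
)
print(faq_tokenizer.decode(outputs[0], skip_special_tokens=True))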
|
|
|
|
|
|
|
|
|
|
|
|