# Fine-tune microsoft/DialoGPT-medium on the Victorano/customer-support-1k
# dataset for a customer-support chatbot.
import os
from typing import Dict, List

import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
def load_and_prepare_data():
    """Load the customer-support dataset and tokenize it for causal-LM training.

    Returns:
        tuple: (tokenized DatasetDict with "train"/"test" splits, tokenizer)
    """
    # Load the dataset (Q/A pairs with 'question' and 'answer' columns)
    dataset = load_dataset("Victorano/customer-support-1k")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    # DialoGPT uses a GPT-2 tokenizer, which ships with no pad token;
    # padding="max_length" below would raise without one, so reuse EOS
    # for padding (standard practice for causal LMs).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Function to format conversations
    def format_conversation(example):
        # Combine question and answer into a single conversation string.
        conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        return {"text": conversation}

    # Apply formatting to both train and test sets
    formatted_dataset = dataset.map(
        format_conversation,
        remove_columns=dataset["train"].column_names,
    )

    # Tokenize the dataset
    def tokenize_function(examples):
        # No return_tensors here: datasets.map stores plain Python lists in
        # Arrow anyway, and the Trainer's data collator builds the tensors
        # per batch. Passing return_tensors="pt" inside a batched map is a
        # common source of shape/serialization errors.
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_dataset = formatted_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=formatted_dataset["train"].column_names,
    )
    return tokenized_dataset, tokenizer
def train_model():
    """Fine-tune DialoGPT-medium on the prepared dataset and save the result.

    Side effects: writes checkpoints/logs under ./customer_support_chatbot and
    ./logs, and saves the final model + tokenizer to ./customer_support_chatbot.
    """
    # Prepare tokenized train/test splits and the matching tokenizer.
    tokenized_dataset, tokenizer = load_and_prepare_data()

    # Base model to fine-tune.
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    # Training configuration: evaluate and checkpoint once per epoch so that
    # load_best_model_at_end can pick the best epoch by eval loss.
    training_args = TrainingArguments(
        output_dir="./customer_support_chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    # mlm=False selects the causal-LM collator (labels = shifted input ids).
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=collator,
    )

    # Run fine-tuning, then persist model and tokenizer together so the
    # output directory is directly loadable with from_pretrained().
    trainer.train()
    model.save_pretrained("./customer_support_chatbot")
    tokenizer.save_pretrained("./customer_support_chatbot")
    print("Training completed! Model saved to ./customer_support_chatbot")
# Script entry point: run training only when executed directly.
if __name__ == "__main__":
    train_model()