import json

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, concatenate_datasets

def load_and_prepare_data():
    # Load the base customer support dataset from the Hugging Face Hub
    base_dataset = load_dataset("Victorano/customer-support-1k")

    # Load custom car service data from a local JSONL file
    car_service_data = []
    with open("car_service_data.jsonl", "r") as f:
        for line in f:
            car_service_data.append(json.loads(line))

    # Convert the car service records into a datasets.Dataset
    car_service_dataset = Dataset.from_list(car_service_data)
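    # An illustrative record for car_service_data.jsonl (the field names match
    # what format_conversation() expects below; the example values themselves
    # are assumptions, not taken from the original file):
    #   {"customer_query": "How often should I rotate my tires?",
    #    "support_response": "Most manufacturers recommend every 5,000 to 8,000 miles."}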
    # Load the tokenizer; DialoGPT has no pad token by default, so the
    # max-length padding below would fail without assigning one here
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    tokenizer.pad_token = tokenizer.eos_token
    # Format each example as a single "Customer/Support" exchange
    def format_conversation(example):
        if "question" in example and "answer" in example:
            # Format for the base dataset
            conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        else:
            # Format for the car service data
            conversation = f"Customer: {example['customer_query']}\nSupport: {example['support_response']}"
        return {"text": conversation}
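    # For example, a base-dataset row with illustrative values becomes:
    #   "Customer: Where is my order?\nSupport: You can track it from your account page."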
    # Apply formatting to both datasets
    formatted_base_dataset = base_dataset.map(
        format_conversation,
        remove_columns=base_dataset["train"].column_names,
    )
    formatted_car_dataset = car_service_dataset.map(
        format_conversation,
        remove_columns=car_service_dataset.column_names,
    )

    # Hold out a slice of the car service data for evaluation; putting the
    # full set into both splits would leak training examples into the test set
    car_split = formatted_car_dataset.train_test_split(test_size=0.1, seed=42)

    # Combine datasets
    combined_train = concatenate_datasets([formatted_base_dataset["train"], car_split["train"]])
    combined_test = concatenate_datasets([formatted_base_dataset["test"], car_split["test"]])
    # Tokenize the dataset. No return_tensors here: Dataset.map stores plain
    # lists, and the data collator converts batches to tensors at train time.
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_dataset = {
        "train": combined_train.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_train.column_names,
        ),
        "test": combined_test.map(
            tokenize_function,
            batched=True,
            remove_columns=combined_test.column_names,
        ),
    }

    return tokenized_dataset, tokenizer

def train_model():
    # Load and prepare data
    tokenized_dataset, tokenizer = load_and_prepare_data()

    # Load the base model
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./customer_support_chatbot",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",  # renamed eval_strategy in transformers >= 4.46
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    # Initialize the trainer; mlm=False makes the collator build causal LM
    # labels from the input ids rather than masked-token targets
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained("./customer_support_chatbot")
    tokenizer.save_pretrained("./customer_support_chatbot")
    print("Training completed! Model saved to ./customer_support_chatbot")
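
# Minimal inference sketch (an addition, not part of the original script):
# load the saved checkpoint and generate a reply for a single query. The
# prompt mirrors the "Customer: ...\nSupport:" layout used in training, and
# the generation settings are illustrative assumptions.
def generate_response(query: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained("./customer_support_chatbot")
    model = AutoModelForCausalLM.from_pretrained("./customer_support_chatbot")
    prompt = f"Customer: {query}\nSupport:"
    inputs = tokenizer(prompt, return_tensors="pt")
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, skipping the prompt
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()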

if __name__ == "__main__":
    train_model()