File size: 2,676 Bytes
0680865
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import numpy as np
from typing import Dict, List
import os

def load_and_prepare_data():
    """Load the customer-support dataset and tokenize it for causal-LM training.

    Each example's ``question`` and ``answer`` fields are merged into one
    two-line string (``Customer: <question>`` followed by
    ``Support: <answer>``), which is then tokenized with the DialoGPT-medium
    tokenizer and padded/truncated to 512 tokens.

    Returns:
        A ``(tokenized_dataset, tokenizer)`` tuple. ``tokenized_dataset`` has
        the same splits as the loaded dataset and contains only the columns
        produced by the tokenizer. ``tokenizer`` is the tokenizer that was
        used, with a padding token configured.
    """
    # NOTE(review): the code below expects a "train" split and "question" /
    # "answer" columns in this dataset -- confirm against the dataset card.
    dataset = load_dataset("Victorano/customer-support-1k")

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

    # DialoGPT uses the GPT-2 tokenizer, which defines no padding token.
    # Without one, padding="max_length" below raises a ValueError. Reusing the
    # EOS token avoids growing the vocabulary / resizing model embeddings.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def format_conversation(example):
        """Merge one question/answer pair into a single training string."""
        conversation = f"Customer: {example['question']}\nSupport: {example['answer']}"
        return {"text": conversation}

    # map() on the loaded dataset is applied to every split it contains; the
    # original columns are dropped so only "text" remains.
    formatted_dataset = dataset.map(
        format_conversation,
        remove_columns=dataset["train"].column_names,
    )

    def tokenize_function(examples):
        """Tokenize a batch of conversation strings to fixed-length sequences."""
        # No return_tensors here: map() stores the result as Arrow columns, so
        # building torch tensors per batch would only add conversion overhead.
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_dataset = formatted_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=formatted_dataset["train"].column_names,
    )

    return tokenized_dataset, tokenizer

def train_model():
    """Fine-tune DialoGPT-medium on the prepared dataset and save the result.

    Loads the tokenized dataset and tokenizer via ``load_and_prepare_data``,
    trains for three epochs with evaluation and checkpointing after every
    epoch, then writes the model and tokenizer to
    ``./customer_support_chatbot``.

    Returns:
        None. The trained artifacts are written to disk and a completion
        message is printed.
    """
    # Local import: only needed for the keyword-compatibility check below.
    import inspect

    output_dir = "./customer_support_chatbot"

    tokenized_dataset, tokenizer = load_and_prepare_data()

    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    # Newer transformers releases renamed ``evaluation_strategy`` to
    # ``eval_strategy``. Use whichever keyword the installed version accepts
    # so the script does not fail with a TypeError on either side of the rename.
    accepted_params = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted_params:
        eval_strategy_kwarg = "eval_strategy"
    else:
        eval_strategy_kwarg = "evaluation_strategy"

    # NOTE(review): warmup_steps=500 may cover a large share of all optimizer
    # steps if the dataset is as small as its name suggests -- confirm.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        # load_best_model_at_end requires the save and evaluation strategies
        # to match, hence "epoch" for both.
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        **{eval_strategy_kwarg: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        # mlm=False selects causal-LM batching: labels are a copy of the
        # input ids, with padding positions excluded from the loss.
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    trainer.train()

    # Save through the Trainer so the model it holds after training (the best
    # checkpoint, given load_best_model_at_end=True) is what gets written,
    # including when the Trainer has wrapped the model.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Training completed! Model saved to ./customer_support_chatbot")

# Script entry point: run the full fine-tuning pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_model()