jithenderchoudary committed on
Commit
1ed2d91
·
verified ·
1 Parent(s): a73d81f

Create model.py

Browse files
Files changed (1) hide show
  1. model.py +85 -0
model.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.preprocessing import LabelEncoder
6
+ import torch.nn as nn
7
+ from torch.utils.data import Dataset
8
+
9
# Load the intent definitions (tags + example patterns) from intents.json.
# Explicit UTF-8 avoids the platform default encoding (e.g. cp1252 on
# Windows) silently corrupting non-ASCII pattern text.
with open("data/intents.json", encoding="utf-8") as file:
    intents_data = json.load(file)
12
+
13
# Initialize the tokenizer and the BERT classification head.
# num_labels must equal the number of distinct intent tags; a set
# comprehension counts each tag once (the original len(set([...]))
# materialized a throwaway list first).
num_labels = len({intent["tag"] for intent in intents_data["intents"]})
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)
16
+
17
# Prepare the data: one (encoded pattern, tag) pair per training pattern.
train_data = []
train_labels = []

for intent in intents_data["intents"]:
    for pattern in intent["patterns"]:
        # BUGFIX: padding=True is a no-op when tokenizing a single sentence,
        # so every example ended up with its own sequence length and the
        # Trainer's default collator fails to stack them into a batch.
        # Pad each example to the model's maximum length so all tensors
        # share one shape.
        encoded_input = tokenizer(
            pattern,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        train_data.append(encoded_input)
        train_labels.append(intent["tag"])

# Encode the string tags (e.g. "greeting", "goodbye") to 0..num_classes-1.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)

# Hold out 20% of the examples for evaluation.
# NOTE(review): no random_state is set, so the split differs run to run —
# pass random_state=... if reproducible evaluation is wanted.
train_data, test_data, train_labels, test_labels = train_test_split(
    train_data, train_labels_encoded, test_size=0.2
)
34
+
35
# Adapter exposing the tokenized patterns to PyTorch's data pipeline.
class ChatbotDataset(Dataset):
    """Torch Dataset pairing tokenized chat patterns with integer intent ids."""

    def __init__(self, data, labels):
        # `data`: list of tokenizer outputs, each tensor shaped (1, seq_len);
        # `labels`: parallel sequence of integer-encoded intent tags.
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoded = self.data[idx]
        sample = {
            key: encoded[key].squeeze()  # drop the batch dim: (1, L) -> (L,)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
50
+
51
# Wrap each split as a PyTorch dataset the Trainer can iterate.
train_dataset = ChatbotDataset(train_data, train_labels)
test_dataset = ChatbotDataset(test_data, test_labels)

# Training configuration: 3 epochs, per-device batches of 8, with an
# evaluation pass and a model checkpoint at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# The Hugging Face Trainer drives the optimization and evaluation loops.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
72
+
73
import pickle  # stdlib; used below to persist the label encoder

# Run fine-tuning.
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so both can
# be reloaded later with from_pretrained("./results").
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

# The label encoder maps class indices back to intent tag strings at
# inference time, so it must be saved as well.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Training complete. Model and tokenizer saved.")