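# Spaces: Runtime error
# Runtime error
# NOTE(review): the two lines above look like page-status text captured together
# with the script rather than program code; kept as comments so they are inert.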
import inspect
import json
import pickle

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
# Load the intent definitions from intents.json.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("data/intents.json", encoding="utf-8") as file:
    intents_data = json.load(file)

# Initialize the tokenizer and a BERT classifier with one output per distinct tag.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
num_labels = len({intent["tag"] for intent in intents_data["intents"]})
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

# Flatten the intents into (pattern, tag) pairs: one training example per pattern.
examples = [
    (pattern, intent["tag"])
    for intent in intents_data["intents"]
    for pattern in intent["patterns"]
]
if not examples:
    raise ValueError("data/intents.json contains no patterns to train on")

patterns = [pattern for pattern, _ in examples]
train_labels = [tag for _, tag in examples]

# Tokenize every pattern in ONE call. With a single string, padding=True has
# nothing to pad against, so each example would keep its own length and the
# examples could not be stacked into a batch later. Tokenizing the whole list
# pads all examples to the longest pattern.
encoded_batch = tokenizer(patterns, padding=True, truncation=True, return_tensors="pt")

# Keep the per-example layout the rest of the script expects: each entry maps a
# field name (input_ids, attention_mask, ...) to a tensor of shape (1, seq_len).
num_examples = encoded_batch["input_ids"].shape[0]
train_data = [
    {key: tensor[row:row + 1] for key, tensor in encoded_batch.items()}
    for row in range(num_examples)
]

# Encode the labels (e.g., "greeting", "goodbye") to numeric values
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)

# Split the data into training and testing sets (80% / 20%).
train_data, test_data, train_labels, test_labels = train_test_split(
    train_data,
    train_labels_encoded,
    test_size=0.2,
)
# Create a custom dataset class for PyTorch
class ChatbotDataset(Dataset):
    """Map-style dataset pairing tokenized patterns with integer intent labels.

    Args:
        data: Sequence of mappings, each providing ``input_ids`` and
            ``attention_mask`` tensors. Each tensor is expected to carry a
            leading batch dimension of size 1, i.e. shape ``(1, seq_len)``.
        labels: Sequence of integer class ids aligned index-for-index with
            ``data``.
    """

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` of shape
            ``(seq_len,)`` and ``labels`` as a 0-d ``torch.long`` tensor.
        """
        sample = self.data[idx]
        return {
            # squeeze(0) drops only the leading batch dimension. A bare
            # squeeze() would also collapse a length-1 sequence to a 0-d tensor.
            'input_ids': sample['input_ids'].squeeze(0),
            'attention_mask': sample['attention_mask'].squeeze(0),
            # Class indices are pinned to int64 whatever the input label type.
            'labels': torch.as_tensor(self.labels[idx], dtype=torch.long),
        }
# Wrap the train/test splits in the PyTorch dataset class.
train_dataset = ChatbotDataset(train_data, train_labels)
test_dataset = ChatbotDataset(test_data, test_labels)

# Training setup
# The keyword selecting the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# Pick whichever name the installed version accepts so the script runs on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    save_strategy="epoch",  # Save the model at the end of each epoch
    **{eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)

# Initialize the Trainer
# The padding collator pads every batch to its longest example, so examples of
# different token lengths can still be batched together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Train the model
trainer.train()

# Save the trained model and tokenizer
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

# Save the label encoder for future inference
# NOTE: pickle files must only be loaded from trusted sources.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Training complete. Model and tokenizer saved.")