Spaces:

jithenderchoudary
/

Chatbot2

Runtime error

App Files Files Community

jithenderchoudary committed on May 6, 2025

Commit

1ed2d91

verified ·

1 Parent(s): a73d81f

Create model.py

Browse files

Files changed (1) hide show

model.py +85 -0

model.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import json
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+import torch.nn as nn
+from torch.utils.data import Dataset
+# Load the training intents. UTF-8 is requested explicitly so decoding does
+# not depend on the platform's default locale encoding.
+with open("data/intents.json", encoding="utf-8") as file:
+    intents_data = json.load(file)
+# Initialize the tokenizer and a classifier with one output per distinct tag
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+num_labels = len({intent["tag"] for intent in intents_data["intents"]})
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
+# Collect every (pattern, tag) training pair
+patterns = []
+train_labels = []
+for intent in intents_data["intents"]:
+    for pattern in intent["patterns"]:
+        patterns.append(pattern)
+        train_labels.append(intent["tag"])
+if not patterns:
+    raise ValueError("data/intents.json contains no training patterns")
+# Tokenize ALL patterns in a single call. With padding=True the tokenizer pads
+# to the longest sequence *of the call*; tokenizing one pattern at a time
+# therefore pads nothing and leaves examples with different lengths, which
+# cannot be stacked into a batch later on.
+encoded_batch = tokenizer(patterns, padding=True, truncation=True, return_tensors="pt")
+# Re-split into one encoding per pattern. Slicing with i:i + 1 keeps the
+# leading batch dimension (shape (1, seq_len)), i.e. the same per-example
+# layout the one-pattern-per-call tokenization produced.
+train_data = [
+    {key: tensor[i:i + 1] for key, tensor in encoded_batch.items()}
+    for i in range(len(patterns))
+]
+# Encode the labels (e.g., "greeting", "goodbye") to numeric values
+label_encoder = LabelEncoder()
+train_labels_encoded = label_encoder.fit_transform(train_labels)
+# Split the data into training and testing sets
+train_data, test_data, train_labels, test_labels = train_test_split(train_data, train_labels_encoded, test_size=0.2)
+# Create a custom dataset class for PyTorch
+class ChatbotDataset(Dataset):
+    """Dataset pairing pre-tokenized patterns with their numeric labels.
+
+    Args:
+        data: Indexable collection of per-example encodings. Each entry must
+            provide ``'input_ids'`` and ``'attention_mask'`` tensors with a
+            leading batch dimension of size 1.
+        labels: Indexable collection of integer class ids, aligned with
+            ``data`` by position.
+    """
+    def __init__(self, data, labels):
+        self.data = data
+        self.labels = labels
+    def __len__(self):
+        """Return the number of examples."""
+        return len(self.data)
+    def __getitem__(self, idx):
+        """Return example ``idx`` as a dict of tensors.
+
+        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
+        ``'labels'``.
+        """
+        encoding = self.data[idx]
+        return {
+            # squeeze(0) drops only the leading batch dimension. A bare
+            # squeeze() would also collapse a length-1 sequence to a 0-d
+            # tensor.
+            'input_ids': encoding['input_ids'].squeeze(0),
+            'attention_mask': encoding['attention_mask'].squeeze(0),
+            # Pin the dtype to int64 regardless of the label's source type.
+            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
+        }
+# Wrap both splits so the Trainer can index them
+train_dataset = ChatbotDataset(train_data, train_labels)
+test_dataset = ChatbotDataset(test_data, test_labels)
+# Training setup
+# transformers renamed the `evaluation_strategy` argument to `eval_strategy`,
+# and newer releases reject the old spelling. Pick whichever name the
+# installed version accepts so the script runs on both sides of the rename.
+import inspect
+eval_strategy_kwarg = (
+    "eval_strategy"
+    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
+    else "evaluation_strategy"
+)
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    logging_dir="./logs",
+    save_strategy="epoch",       # Save the model at the end of each epoch
+    **{eval_strategy_kwarg: "epoch"},  # Evaluate at the end of each epoch
+)
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=test_dataset,
+)
+# Train the model
+trainer.train()
+# Write the trained model and the tokenizer into the same output directory
+for artifact in (model, tokenizer):
+    artifact.save_pretrained("./results")
+# Persist the fitted label encoder as well, for use at inference time
+import pickle
+with open('label_encoder.pkl', 'wb') as encoder_file:
+    pickle.dump(label_encoder, encoder_file)
+print("Training complete. Model and tokenizer saved.")