import math
from collections import Counter
# Lookalike characters normalised before tokenising, so that Latin letters
# typed in place of visually identical Cyrillic ones ("c" vs "с", "o" vs "о")
# and common Russian spelling variants ("й"/"и", "ё"/"е") compare equal.
# NOTE(review): the "i": "l" entry maps Latin i to Latin l, unlike every
# other entry which targets a Cyrillic letter — confirm this is intentional.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
# Characters treated as part of a word (Latin + Cyrillic alphabets + space);
# any character outside this set is split out as its own token by tokenize().
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "
def countwords(x):
    """Return a dict mapping each element of *x* to its number of occurrences.

    Replaces a hand-rolled counting loop with collections.Counter (idiomatic
    and C-accelerated); converted back to a plain dict so the return type is
    unchanged for callers.
    """
    return dict(Counter(x))
def add_dict(a, b):
    """Return a new dict whose values are the element-wise sums of *a* and *b*.

    Keys present in only one input keep their original value.  Neither input
    is mutated.  The original three-branch merge is collapsed into a copy of
    *a* plus a single accumulation pass over *b*.
    """
    merged = dict(a)
    for key, value in b.items():
        merged[key] = merged.get(key, 0) + value
    return merged
class Chatbot:
    """Bag-of-words response selector.

    Trained on a mapping of message -> response, it scores each known
    response by summing the per-response token frequencies of the input's
    tokens and returns the highest-scoring one.
    """

    def __init__(self, name=None, n: int = 1, letter_replace: bool = True,
                 data: dict = None, frequency_weight: float = 0,
                 div_by_len: bool = False):
        """Configure the bot and optionally train it immediately.

        name             -- optional display name (not used internally).
        n                -- largest n-gram size to tokenize into (1 = unigrams).
        letter_replace   -- normalise lookalike characters via similar_letters.
        data             -- optional {message: response} training mapping.
        frequency_weight -- 0..1 blend between raw token score (0) and
                            response-frequency-weighted score (1).
        div_by_len       -- divide scores by the number of input tokens.
        """
        self.name = name
        self.letter_replace = letter_replace
        self.frequency_weight = frequency_weight
        self.div_by_len = div_by_len
        self.model = {}
        # Stored as "largest n-gram size minus one": 0 means unigrams only.
        self.n = n - 1
        if data is not None:
            self.train(data)

    def tokenize(self, text: str, n: int = 1):
        """Lower-case, normalise and split *text* into tokens plus n-grams.

        NOTE(review): the *n* parameter is unused; the n-gram size comes from
        self.n set in __init__.  Kept for interface compatibility.
        """
        preprocess = ""
        for ch in text.lower():
            if ch in letters:
                if ch in similar_letters and self.letter_replace:
                    preprocess += similar_letters[ch]
                else:
                    preprocess += ch
            else:
                # Pad non-word characters with spaces so each becomes a
                # standalone token after split().
                preprocess += " " + ch + " "
        tokens = preprocess.split()
        output = tokens.copy()
        # Append n-grams of sizes 2 .. self.n + 1.
        # BUG FIX: the original loop sliced tokens[:-i] starting at i == 0,
        # which is the empty slice, and at i == 1 only re-appended unigrams —
        # so for n > 1 no real n-grams were ever produced.
        for size in range(2, self.n + 2):
            for start in range(len(tokens) - size + 1):
                output.append(' '.join(tokens[start:start + size]))
        return output

    def train(self, data: dict):
        """Build/extend the model from a {message: response} mapping.

        For each response, accumulates token counts over all of its messages,
        then derives per-token probabilities and a frequency weight
        (fraction of training pairs that map to this response).
        NOTE(review): repeated train() calls recompute weights against the
        new call's len(data) only — confirm that is the intended behaviour.
        """
        lendata = len(data)
        lendata_div = 1 / lendata
        for message in data:
            response = data[message]
            if response not in self.model:
                self.model[response] = {
                    "word count": countwords(self.tokenize(message)),
                    "probabilities": {},
                    "weight count": 1,
                    "weight": 0,
                }
            else:
                self.model[response]["word count"] = add_dict(
                    countwords(self.tokenize(message)),
                    self.model[response]["word count"],
                )
                self.model[response]["weight count"] += 1
        for response in self.model:
            entry = self.model[response]
            # Normalise counts to probabilities; math.fsum for accurate
            # float summation of the total count.
            div = 1 / math.fsum(entry["word count"].values())
            entry["probabilities"] = {
                word: count * div for word, count in entry["word count"].items()
            }
            entry["weight"] = entry["weight count"] * lendata_div

    def get_responses(self, text: str):
        """Score every known response against *text*.

        Returns a list of (response, score) tuples sorted best-first.
        """
        tokens = self.tokenize(text)
        lentokens = len(tokens)
        # BUG FIX: an empty (or whitespace-only) message yields no tokens;
        # the original divided 1/0 here and raised ZeroDivisionError.
        lentokens_div = 1 / lentokens if lentokens else 0
        scores = []
        for choice in self.model:
            probabilities = self.model[choice]["probabilities"]
            score = 0
            for token in tokens:
                if token in probabilities:
                    score += probabilities[token]
            if self.div_by_len:
                score *= lentokens_div
            # Linear blend between the raw score and the frequency-weighted
            # score, controlled by frequency_weight.
            score *= (self.frequency_weight * self.model[choice]["weight"]
                      + (1 - self.frequency_weight))
            scores.append((choice, score))
        return sorted(scores, key=lambda pair: pair[1], reverse=True)

    def __call__(self, text: str):
        """Return the single best response for *text*.

        Raises IndexError if the model is empty (untrained), matching the
        original behaviour.
        """
        return self.get_responses(text)[0][0]
if __name__ == "__main__":
    import json

    # dataset.json maps input phrases to responses: {"message": "reply", ...}.
    # BUG FIX: JSON is UTF-8 by specification; without an explicit encoding,
    # platforms with a non-UTF-8 default codec fail on the Cyrillic dataset.
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    cb = Chatbot(data=data)
    # Simple console REPL; the bot's "Пока" ("Bye") reply ends the session.
    while True:
        message = input("User: ")
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":
            break