Add n-gram update
Browse files
main.py
CHANGED
|
@@ -25,15 +25,16 @@ def add_dict(a, b):
|
|
| 25 |
return temp
|
| 26 |
|
| 27 |
class Chatbot:
|
| 28 |
-
def __init__(self, name=None, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
# Pre-change constructor (removed side of this diff, marked with `-`).
# Stores name, letter_replace, frequency_weight and div_by_len on the instance,
# starts with an empty model dict, and calls train(data) when data is not None.
|
| 29 |
self.name = name
|
| 30 |
self.letter_replace = letter_replace
|
| 31 |
self.frequency_weight = frequency_weight
|
| 32 |
self.div_by_len = div_by_len
|
| 33 |
self.model = {}
|
|
|
|
| 34 |
if data is not None:
|
| 35 |
self.train(data)
|
| 36 |
-
def tokenize(self, text: str):
|
| 37 |
preprocess = ""
|
| 38 |
for x in text.lower():
|
| 39 |
if x in letters:
|
|
@@ -42,8 +43,13 @@ class Chatbot:
|
|
| 42 |
else:
|
| 43 |
preprocess += x
|
| 44 |
else:
|
| 45 |
-
preprocess += " "+x+" "
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def train(self, data: dict):
|
| 48 |
lendata = len(data)
|
| 49 |
lendata_div = 1/lendata
|
|
|
|
| 25 |
return temp
|
| 26 |
|
| 27 |
class Chatbot:
|
| 28 |
+
def __init__(self, name = None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
# Post-change constructor (added side of this diff, marked with `+`).
# Stores name, letter_replace, frequency_weight and div_by_len on the instance,
# starts with an empty model dict, records n - 1 as self.n, and calls
# train(data) when data is not None.
# NOTE(review): `n` was inserted as the second positional parameter, so every
# later parameter moved one position to the right; a call that passed
# letter_replace (or data, ...) positionally now binds that value to the wrong
# parameter. Confirm all callers use keywords, or move `n` to the end.
|
| 29 |
self.name = name
|
| 30 |
self.letter_replace = letter_replace
|
| 31 |
self.frequency_weight = frequency_weight
|
| 32 |
self.div_by_len = div_by_len
|
| 33 |
self.model = {}
# self.n holds n - 1, not n; tokenize() uses it as the bound of
# `for i in range(self.n)`.
|
| 34 |
+
self.n = n-1
# train() is called last, after all attributes above are assigned.
|
| 35 |
if data is not None:
|
| 36 |
self.train(data)
|
| 37 |
+
def tokenize(self, text: str, n: int = 1):
# Post-change tokenizer (added side of this diff).
# Walks the lower-cased text one character at a time, accumulating a string in
# `preprocess`, splits that string on whitespace into `tokens`, then appends
# space-joined token windows to a copy of the token list and returns the copy.
# `letters` is not defined in the visible lines; it comes from elsewhere in the file.
# New-file lines 41-42 are elided between the two hunks, so the body of the
# `if x in letters:` branch is not fully visible in this diff.
# NOTE(review): the `n` parameter is never read in the visible lines; the
# window loop below uses self.n instead. Either drop the parameter or use it.
|
| 38 |
preprocess = ""
|
| 39 |
for x in text.lower():
|
| 40 |
if x in letters:
|
|
|
|
| 43 |
else:
|
| 44 |
preprocess += x
|
| 45 |
else:
# Indentation is lost in this rendering; presumably this `else` pairs with
# `if x in letters:`, so a character not in `letters` is padded with spaces
# and becomes its own token after split() -- TODO confirm against main.py.
|
| 46 |
+
preprocess += " " + x + " "
|
| 47 |
+
tokens = preprocess.split()
# `output` starts as a copy of the single tokens, so `tokens` itself is not
# mutated while it is being sliced below.
|
| 48 |
+
output = tokens.copy()
# NOTE(review): the window loop looks off by one. self.n is n - 1 (set in
# __init__), so i takes the values 0 .. n-2:
#   * i == 0: tokens[:-0] is tokens[:0], an empty list, so nothing is appended.
#   * i >= 1: tokens[num:num+i] holds i tokens, so i == 1 re-appends single
#     tokens that are already in `output`, and windows of n tokens are never built.
#   With n == 2 the loop runs only for i == 0 and adds no bigrams at all.
# tokens[:-i] also has len(tokens) - i start positions, one fewer than the
# len(tokens) - i + 1 windows of size i, so the last window is skipped.
# Possible fix (verify intent first): for size in range(2, n + 1), iterate
# start in range(len(tokens) - size + 1) and join tokens[start:start + size].
|
| 49 |
+
for i in range(self.n):
|
| 50 |
+
for num, word in enumerate(tokens[:-i]):
# `word` is unused; only the index `num` drives the slice.
|
| 51 |
+
output.append(' '.join(tokens[num:num+i]))
|
| 52 |
+
return output
|
| 53 |
def train(self, data: dict):
|
| 54 |
lendata = len(data)
|
| 55 |
lendata_div = 1/lendata
|