ierhon
/

neural-chatbot

Text Generation

Model card Files Files and versions

ierhon committed on Jul 29, 2023

Commit

23f7f3b

·

1 Parent(s): 6d37d8c

Use new dataset.json

Files changed (1) hide show

train.py +6 -4

train.py CHANGED Viewed

@@ -8,10 +8,12 @@ from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
 from model_settings import *
-with open("dataset.json", "r") as f: # TODO: move the outputs into a separate file, so it would be "key": 0, "key2": 1 etc
     dset = json.load(f)
-dset_size = len(dset)
 tokenizer = Tokenizer() # a tokenizer is a thing to split text into words, it might have some other stuff like making all the letters lowercase, etc.
 tokenizer.fit_on_texts(list(dset.keys()))
@@ -31,11 +33,11 @@ model.add(Dense(dset_size, activation="linear")) # TBH it doesn't matter that mu
 X = [] # we're loading the training data into input X
 y = [] # and output y
-for line, key in enumerate(dset):
     tokens = tokenizer.texts_to_sequences([key,])[0]
     X.append(np.array((list(tokens)+[0,]*inp_len)[:inp_len])) # refusing to use pad_sequences for an unspecified reason and creating the worst line of code
     output_array = np.zeros(dset_size)
-    output_array[line] = 1 # 0 0 0 1 0 0 0 0 0, the neuron of the each line activates in the correct response
     y.append(output_array)
 X = np.array(X) # normal lists are way slower than numpy arrays (remember, a list and an array is not the same thing, an array is far more limited)

 from model_settings import *
+with open("dataset.json", "r") as f:
     dset = json.load(f)
+with open("responses.txt", "r") as f:
+    dset_size = len(f.readlines())
 tokenizer = Tokenizer() # a tokenizer is a thing to split text into words, it might have some other stuff like making all the letters lowercase, etc.
 tokenizer.fit_on_texts(list(dset.keys()))
 X = [] # we're loading the training data into input X
 y = [] # and output y
+for key in dset:
     tokens = tokenizer.texts_to_sequences([key,])[0]
     X.append(np.array((list(tokens)+[0,]*inp_len)[:inp_len])) # refusing to use pad_sequences for an unspecified reason and creating the worst line of code
     output_array = np.zeros(dset_size)
+    output_array[dset[key]] = 1 # 0 0 0 1 0 0 0 0 0, the neuron of the each line activates in the correct response
     y.append(output_array)
 X = np.array(X) # normal lists are way slower than numpy arrays (remember, a list and an array is not the same thing, an array is far more limited)