| | |
| | from tensorflow.keras.models import Model |
| | from tensorflow.keras import models |
| | from tensorflow.keras.utils import plot_model |
| | from tensorflow.keras.layers import Input,LSTM,Dense |
| | from sklearn.feature_extraction.text import CountVectorizer |
| | import numpy as np |
| | import pickle |
| |
|
| | |
# Parallel corpora: input_texts[k] is a lower-cased source sentence and
# target_texts[k] is its lower-cased counterpart wrapped in '\t' ... '\n'.
input_texts = []
target_texts = []
# Every distinct character seen on each side of the corpus.
input_characters = set()
target_characters = set()

# Each usable line of the dataset has the form "<source>\t<target>".
with open('eng-french.txt', 'r', encoding='utf-8') as f:
    rows = f.read().split('\n')

# Only the first 10000 rows of the file are considered.
for row in rows[:10000]:
    # A trailing newline or a blank line yields an empty row, which cannot be
    # unpacked into two fields; skip it instead of crashing with ValueError.
    if not row:
        continue

    input_text, target_text = row.split('\t')

    # Wrap the target in '\t' ... '\n' (presumably start/end-of-sequence
    # markers for the decoder -- confirm against the inference code).
    target_text = '\t' + target_text + '\n'

    # Lower-case once and reuse the result for both the text lists and the
    # character sets.
    input_lower = input_text.lower()
    target_lower = target_text.lower()
    input_texts.append(input_lower)
    target_texts.append(target_lower)

    # set.update() iterates a string character by character.
    input_characters.update(input_lower)
    target_characters.update(target_lower)
| |
|
| | |
# Turn the character sets into sorted lists so each character has a stable
# position in its alphabet.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Alphabet sizes on the encoder (input) and decoder (target) side.
num_en_chars = len(input_characters)
num_dec_chars = len(target_characters)

# Length, in characters, of the longest text on each side.
max_input_length = max(len(text) for text in input_texts)
max_target_length = max(len(text) for text in target_texts)
| |
|
def bagofcharacters(input_texts,target_texts):
    """One-hot encode paired texts into padded encoder/decoder arrays.

    Every character of a text becomes one row produced by a binary,
    character-level ``CountVectorizer`` fitted on the module-level
    ``input_characters`` / ``target_characters`` alphabets. Texts shorter
    than the module-level ``max_input_length`` / ``max_target_length`` are
    right-padded with a fixed one-hot row (index 0 on the input side,
    index 2 on the target side).

    Args:
        input_texts: Source strings.
        target_texts: Target strings, paired positionally with
            ``input_texts``; surplus items on the longer side are ignored.

    Returns:
        A tuple ``(en_in_data, dec_in_data, dec_tr_data)`` of float32 arrays.
        ``en_in_data`` has shape
        ``(n, max_input_length, len(input_characters))``; the other two have
        shape ``(n, max_target_length, len(target_characters))``.
        ``dec_tr_data`` holds each target without its first character, i.e.
        ``dec_in_data`` shifted one step to the left and padded at the end.

    Raises:
        ValueError: If a text is longer than the corresponding maximum
            length (the encoded rows do not fit the preallocated array).
    """
    pairs = list(zip(input_texts, target_texts))
    n_samples = len(pairs)

    # Padding rows.
    # NOTE(review): indices 0 and 2 are hard-coded; presumably they are the
    # positions of the space character in the sorted alphabets -- confirm.
    pad_en = np.zeros(len(input_characters), dtype="float32")
    pad_en[0] = 1
    pad_dec = np.zeros(len(target_characters), dtype="float32")
    pad_dec[2] = 1

    # Fit one vectorizer per alphabet, once, instead of refitting a shared
    # vectorizer twice for every sample. `tokenizer` and `stop_words` are
    # omitted because scikit-learn ignores them when analyzer='char'.
    en_vectorizer = CountVectorizer(binary=True, analyzer='char').fit(input_characters)
    dec_vectorizer = CountVectorizer(binary=True, analyzer='char').fit(target_characters)

    # Preallocate the outputs already filled with the padding row; the real
    # characters then overwrite the leading positions of every sample.
    en_in_data = np.tile(pad_en, (n_samples, max_input_length, 1))
    dec_in_data = np.tile(pad_dec, (n_samples, max_target_length, 1))
    dec_tr_data = dec_in_data.copy()

    for i, (input_t, target_t) in enumerate(pairs):
        if input_t:
            # Each character is treated as its own "document", so transform()
            # yields one one-hot row per character.
            en_in_data[i, :len(input_t)] = en_vectorizer.transform(list(input_t)).toarray()
        if target_t:
            target_rows = dec_vectorizer.transform(list(target_t)).toarray()
            dec_in_data[i, :len(target_t)] = target_rows
            # The training target drops the first character of the sequence.
            dec_tr_data[i, :len(target_t) - 1] = target_rows[1:]

    return en_in_data,dec_in_data,dec_tr_data
| |
|
| | |
# Encoder input: a variable-length sequence of vectors, one per character,
# each num_en_chars wide.
en_inputs = Input(shape=(None, num_en_chars))

# return_state=True makes the layer hand back its final hidden and cell
# states in addition to its output.
encoder = LSTM(units=256, return_state=True)

en_outputs, state_h, state_c = encoder(en_inputs)
# Only the two final states are carried forward.
en_states = [state_h, state_c]
| |
|
| | |
# Decoder input: a variable-length sequence of vectors, one per character,
# each num_dec_chars wide.
dec_inputs = Input(shape=(None, num_dec_chars))

# Decoder layers: an LSTM that emits an output at every time step (and also
# returns its states), followed by a softmax over the target alphabet.
dec_lstm = LSTM(units=256, return_sequences=True, return_state=True)
dec_dense = Dense(units=num_dec_chars, activation="softmax")

# The decoder starts from the encoder's final states; the states it returns
# itself are discarded here.
dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
dec_outputs = dec_dense(dec_outputs)
| |
|
| | |
# Full model: takes [encoder input, decoder input] and produces the decoder's
# per-step softmax output.
model = Model([en_inputs, dec_inputs], dec_outputs)

# Persist the alphabets and size constants next to the model. The context
# manager guarantees the file is flushed and closed.
with open("training_data.pkl", "wb") as metadata_file:
    pickle.dump(
        {
            'input_characters': input_characters,
            'target_characters': target_characters,
            'max_input_length': max_input_length,
            'max_target_length': max_target_length,
            'num_en_chars': num_en_chars,
            'num_dec_chars': num_dec_chars,
        },
        metadata_file,
    )

# NOTE(review): no compile()/fit() call is visible before this point, so the
# model saved here carries freshly initialised weights unless training
# happens elsewhere -- confirm this is intended.
# (A stray `cd` token after this call made the original line a syntax error.)
model.save("s2s")

model.summary()
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
| |
|
| |
|