Sentence-Translator / temp /modelCreator.py
VashuTheGreat2's picture
Upload folder using huggingface_hub
b758d48 verified
Raw
History Blame Contribute Delete
5.03 kB
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
import pickle
import os
class Create:
    """Build, train and save an LSTM encoder-decoder translation model.

    The steps are meant to run in order, because each one reads attributes
    set by the previous one:
    ``dataClean`` -> ``compileModel`` -> ``fitModel`` -> ``saveModel``.
    """

    def __init__(self, data, inputCol, outputCol, epochs, batch_size):
        """Store the training table and the training hyper-parameters.

        Args:
            data: pandas DataFrame holding the parallel sentences.
            inputCol: Label of the column with the source sentences.
            outputCol: Label of the column with the target sentences.
            epochs: Number of epochs passed to ``model.fit``.
            batch_size: Batch size passed to ``model.fit``.
        """
        self.data = data
        self.inputCol = inputCol
        self.outputCol = outputCol
        self.epochs = epochs
        self.batch_size = batch_size

    @staticmethod
    def prep_text(s):
        """Return *s* lower-cased with every whitespace run collapsed to one space.

        Non-string values (for example numbers read from a CSV) are converted
        with ``str`` first instead of raising ``AttributeError``.
        """
        # split() without arguments already discards leading/trailing blanks.
        return " ".join(str(s).lower().split())

    def dataClean(self):
        """Normalise the sentences, fit both tokenizers and build padded arrays.

        Rows with a missing source or target sentence are dropped. The work is
        done on a copy, so the DataFrame handed to the constructor is not
        modified; ``self.data`` is rebound to the cleaned copy.

        Sets ``tokenizer_e``, ``tokenizer_f``, ``src_seq``, ``tgt_seq``,
        ``max_len_src``, ``max_len_tgt``, ``tgt_input``, ``tgt_output``,
        ``vocab_src`` and ``vocab_tgt``.

        Raises:
            ValueError: If no sentence pair is left after dropping incomplete
                rows, or if the source column yields no tokens at all.
        """
        print("Cleaning the data")
        # '_' is deliberately not filtered so the start_/_end markers survive.
        filters = '"!#$%&()*+,-./:;=?@[\\]^`{|}~\t\n'

        frame = self.data.dropna(subset=[self.inputCol, self.outputCol]).copy()
        dropped = len(self.data) - len(frame)
        if dropped:
            print(f"Dropped {dropped} row(s) with a missing sentence")
        if frame.empty:
            raise ValueError(
                "No usable sentence pairs: the data is empty after removing "
                "rows with missing values"
            )

        frame[self.inputCol] = frame[self.inputCol].apply(Create.prep_text)
        # Wrap every target sentence in start/end markers for the decoder.
        frame[self.outputCol] = frame[self.outputCol].apply(
            lambda s: f"start_ {Create.prep_text(s)} _end"
        )
        self.data = frame

        # One tokenizer per language.
        self.tokenizer_e = Tokenizer(filters=filters, lower=True, oov_token=None)
        self.tokenizer_f = Tokenizer(filters=filters, lower=True, oov_token=None)
        self.tokenizer_e.fit_on_texts(frame[self.inputCol])
        self.tokenizer_f.fit_on_texts(frame[self.outputCol])

        # Integer-encode the source and target sentences.
        src_ids = self.tokenizer_e.texts_to_sequences(frame[self.inputCol])
        tgt_ids = self.tokenizer_f.texts_to_sequences(frame[self.outputCol])

        # Longest sequence on each side decides the padded width.
        self.max_len_src = max(len(ids) for ids in src_ids)
        self.max_len_tgt = max(len(ids) for ids in tgt_ids)
        if self.max_len_src == 0:
            raise ValueError(
                "The source column produced no tokens; cannot build an "
                "encoder input of length 0"
            )

        # Pad at the end of each sequence.
        self.src_seq = pad_sequences(src_ids, maxlen=self.max_len_src, padding='post')
        self.tgt_seq = pad_sequences(tgt_ids, maxlen=self.max_len_tgt, padding='post')

        # Teacher forcing: the decoder input is every time step but the last,
        # and the expected output is the same sequence shifted left by one
        # (so the leading start_ marker is dropped).
        self.tgt_input = self.tgt_seq[:, :-1]
        self.tgt_output = self.tgt_seq[:, 1:]

        # Word indices start at 1, hence the +1 for the vocabulary size.
        self.vocab_src = len(self.tokenizer_e.word_index) + 1
        self.vocab_tgt = len(self.tokenizer_f.word_index) + 1

    def compileModel(self, latent_dim=256, embedding_dim=128):
        """Build and compile the encoder-decoder network into ``self.model``.

        Requires ``dataClean`` to have run (uses the vocabulary sizes and the
        maximum sequence lengths).

        Args:
            latent_dim: Number of units in the encoder and decoder LSTM.
            embedding_dim: Output size of both embedding layers.

        NOTE(review): any separate inference code presumably rebuilds the
        layers with the default sizes - confirm before changing them.
        """
        print("Compiling the model")

        # Encoder: only its final hidden and cell state are kept.
        enc_inputs = Input(shape=(self.max_len_src,))
        enc_emb = Embedding(self.vocab_src, embedding_dim, mask_zero=True)(enc_inputs)
        _, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
        enc_states = [state_h, state_c]  # context vector

        # Decoder: one step shorter than the padded target because the input
        # is the target with its last time step removed.
        dec_inputs = Input(shape=(self.max_len_tgt - 1,))
        dec_emb = Embedding(self.vocab_tgt, embedding_dim, mask_zero=True)(dec_inputs)
        dec_outputs = LSTM(latent_dim, return_sequences=True, return_state=False)(
            dec_emb, initial_state=enc_states
        )
        # One probability distribution over the target vocabulary per step.
        dec_probs = Dense(self.vocab_tgt, activation='softmax')(dec_outputs)

        self.model = Model([enc_inputs, dec_inputs], dec_probs)
        # Targets are integer word ids (a classification problem, not a
        # regression), so the sparse categorical loss is used.
        self.model.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

    def fitModel(self):
        """Train ``self.model`` on the arrays prepared by ``dataClean``."""
        print("Training the model")
        # Give the targets a trailing axis: (batch, timesteps, 1).
        y_sparse = np.expand_dims(self.tgt_output, -1)
        self.model.fit(
            [self.src_seq, self.tgt_input],
            y_sparse,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=1,
        )

    def saveModel(self, Modelname, InputTokenizer, OutputTokenizer, Input_max_len_src, Output_max_len_tgt):
        """Write the model and its preprocessing artefacts into one folder.

        The folder ``Modelname`` is created when missing. The model is saved
        as ``<folder name>.keras`` inside it; each remaining argument is the
        file stem of a pickle (``<stem>.pkl``) written to the same folder.

        Args:
            Modelname: Output folder; its last path component also names the
                ``.keras`` file.
            InputTokenizer: File stem for the pickled source tokenizer.
            OutputTokenizer: File stem for the pickled target tokenizer.
            Input_max_len_src: File stem for the pickled source max length.
            Output_max_len_tgt: File stem for the pickled target max length.
        """
        print("Saving the model")
        folder = Modelname
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(folder, exist_ok=True)

        # Use only the last path component for the file name so nested
        # folders ("out/en_fr") or a trailing slash still give a valid path.
        stem = os.path.basename(os.path.normpath(folder))
        self.model.save(os.path.join(folder, f'{stem}.keras'))

        artefacts = (
            (InputTokenizer, self.tokenizer_e),
            (OutputTokenizer, self.tokenizer_f),
            (Input_max_len_src, self.max_len_src),
            (Output_max_len_tgt, self.max_len_tgt),
        )
        for file_stem, payload in artefacts:
            with open(os.path.join(folder, f'{file_stem}.pkl'), 'wb') as f:
                pickle.dump(payload, f)
def CreateModel(data, firstCol, SecondCol, name, InputTokenizer, OutputTokenizer,
                Input_max_len_src, Output_max_len_tgt, epochs=30, batch_size=64):
    """Run the full training pipeline and save the result under ``name``.

    Builds a ``Create`` object from the arguments, then cleans the data,
    compiles the model, trains it and saves the model with its artefacts.

    Args:
        data: Table with the parallel sentences.
        firstCol: Label of the source-sentence column.
        SecondCol: Label of the target-sentence column.
        name: Model name, passed to ``saveModel`` as ``Modelname``.
        InputTokenizer: File stem for the saved source tokenizer.
        OutputTokenizer: File stem for the saved target tokenizer.
        Input_max_len_src: File stem for the saved source max length.
        Output_max_len_tgt: File stem for the saved target max length.
        epochs: Number of training epochs.
        batch_size: Training batch size.
    """
    pipeline = Create(
        data=data,
        inputCol=firstCol,
        outputCol=SecondCol,
        epochs=epochs,
        batch_size=batch_size,
    )
    # The steps depend on each other, so they run strictly in this order.
    for step in (pipeline.dataClean, pipeline.compileModel, pipeline.fitModel):
        step()
    pipeline.saveModel(
        Modelname=name,
        InputTokenizer=InputTokenizer,
        OutputTokenizer=OutputTokenizer,
        Input_max_len_src=Input_max_len_src,
        Output_max_len_tgt=Output_max_len_tgt,
    )