Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pandas as pd | |
| from tensorflow.keras.preprocessing.text import Tokenizer | |
| from tensorflow.keras.preprocessing.sequence import pad_sequences | |
| from tensorflow.keras.layers import Input, Embedding, LSTM, Dense | |
| from tensorflow.keras.models import Model | |
| import pickle | |
| import os | |
class Create:
    """Build, train and save an LSTM encoder-decoder (seq2seq) model.

    The methods are meant to be called in this order, because each one
    reads attributes set by the previous one:
    ``dataClean`` -> ``compileModel`` -> ``fitModel`` -> ``saveModel``.
    """

    def __init__(self, data, inputCol, outputCol, epochs, batch_size):
        """Store the training data and the training hyper-parameters.

        Args:
            data: Table indexed by column name whose columns support
                ``.apply`` (presumably a pandas DataFrame -- confirm
                against callers).
            inputCol: Name of the column holding the source sentences.
            outputCol: Name of the column holding the target sentences.
            epochs: Number of epochs passed to ``model.fit``.
            batch_size: Batch size passed to ``model.fit``.
        """
        self.data = data
        self.inputCol = inputCol
        self.outputCol = outputCol
        self.epochs = epochs
        self.batch_size = batch_size

    @staticmethod
    def prep_text(s):
        """Return *s* trimmed, lower-cased, with whitespace runs collapsed.

        Declared as a static method so that it can be called both as
        ``Create.prep_text(s)`` and on an instance.
        """
        return " ".join(s.strip().lower().split())

    def dataClean(self):
        """Normalise both text columns and build the padded training arrays.

        The two columns of ``self.data`` are overwritten in place with
        their cleaned versions; target sentences are additionally wrapped
        in ``start_`` / ``_end`` markers.

        Sets ``tokenizer_e``, ``tokenizer_f``, ``src_seq``, ``tgt_seq``,
        ``max_len_src``, ``max_len_tgt``, ``tgt_input``, ``tgt_output``,
        ``vocab_src`` and ``vocab_tgt``.

        Raises:
            ValueError: If ``self.data`` contains no rows.
        """
        print("Cleaning the data")
        # Fail early with a clear message instead of the opaque
        # "max() arg is an empty sequence" raised further down.
        if len(self.data) == 0:
            raise ValueError("Cannot prepare training data: the dataset is empty")

        # "_" is not part of the filter set, so the start_/_end markers
        # added to the target sentences survive tokenisation.
        filters = '"!#$%&()*+,-./:;=?@[\\]^`{|}~\t\n'
        self.data[self.inputCol] = self.data[self.inputCol].apply(Create.prep_text)
        self.data[self.outputCol] = self.data[self.outputCol].apply(
            lambda s: f"start_ {Create.prep_text(s)} _end"
        )

        # One tokenizer per language: source (e) and target (f).
        self.tokenizer_e = Tokenizer(filters=filters, lower=True, oov_token=None)
        self.tokenizer_f = Tokenizer(filters=filters, lower=True, oov_token=None)
        self.tokenizer_e.fit_on_texts(self.data[self.inputCol])
        self.tokenizer_f.fit_on_texts(self.data[self.outputCol])

        # Source and target sentences as lists of integer token ids.
        self.src_seq = self.tokenizer_e.texts_to_sequences(self.data[self.inputCol])
        self.tgt_seq = self.tokenizer_f.texts_to_sequences(self.data[self.outputCol])

        # Longest sequence on each side; used as the padding length.
        self.max_len_src = max(len(s) for s in self.src_seq)
        self.max_len_tgt = max(len(s) for s in self.tgt_seq)

        # Post-padding: padding values are appended after the real tokens.
        self.src_seq = pad_sequences(self.src_seq, maxlen=self.max_len_src, padding='post')
        self.tgt_seq = pad_sequences(self.tgt_seq, maxlen=self.max_len_tgt, padding='post')

        # Teacher forcing: the decoder input drops the last position and
        # the decoder target drops the first one (the start_ marker), so
        # position t of the input is paired with position t + 1 as target.
        self.tgt_input = self.tgt_seq[:, :-1]
        self.tgt_output = self.tgt_seq[:, 1:]

        # +1 because tokenizer word indices start at 1.
        self.vocab_src = len(self.tokenizer_e.word_index) + 1
        self.vocab_tgt = len(self.tokenizer_f.word_index) + 1

    def compileModel(self, latent_dim=256, embedding_dim=128):
        """Build and compile the encoder-decoder network into ``self.model``.

        Must be called after ``dataClean``, which provides the sequence
        lengths and vocabulary sizes used here.

        Args:
            latent_dim: Number of units in the encoder and decoder LSTMs.
            embedding_dim: Output dimension of both embedding layers.
        """
        print("Compiling the model")

        # Encoder: only the final hidden/cell states are kept; they act
        # as the context vector handed to the decoder.
        enc_inputs = Input(shape=(self.max_len_src,))
        enc_emb = Embedding(self.vocab_src, embedding_dim, mask_zero=True)(enc_inputs)
        _, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
        enc_states = [state_h, state_c]

        # Decoder: the input is one step shorter than the padded target
        # because ``tgt_input`` drops the last position.
        dec_inputs = Input(shape=(self.max_len_tgt - 1,))
        dec_emb = Embedding(self.vocab_tgt, embedding_dim, mask_zero=True)(dec_inputs)
        dec_outputs = LSTM(latent_dim, return_sequences=True, return_state=False)(
            dec_emb, initial_state=enc_states
        )
        # Softmax over the target vocabulary at every timestep.
        dec_probs = Dense(self.vocab_tgt, activation='softmax')(dec_outputs)

        self.model = Model([enc_inputs, dec_inputs], dec_probs)
        # Sparse categorical cross-entropy: the targets are integer token
        # ids (a classification over the vocabulary), not one-hot vectors.
        self.model.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

    def fitModel(self):
        """Train ``self.model`` on the arrays prepared by ``dataClean``."""
        print("Training the model")
        # Append a trailing axis so the targets are (batch, timesteps, 1).
        y_sparse = np.expand_dims(self.tgt_output, -1)
        self.model.fit(
            [self.src_seq, self.tgt_input],
            y_sparse,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=1,
        )

    def saveModel(self, Modelname, InputTokenizer, OutputTokenizer, Input_max_len_src, Output_max_len_tgt):
        """Write the model and its preprocessing artifacts to a folder.

        Everything is stored inside a folder named ``Modelname`` (created
        if missing): the model as ``<Modelname>.keras`` and each artifact
        as a pickle file ``<name>.pkl``.

        Args:
            Modelname: Folder name and stem of the ``.keras`` file.
            InputTokenizer: File stem for the pickled source tokenizer.
            OutputTokenizer: File stem for the pickled target tokenizer.
            Input_max_len_src: File stem for the pickled source max length.
            Output_max_len_tgt: File stem for the pickled target max length.
        """
        print("Saving the model")
        folder = Modelname
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(folder, exist_ok=True)
        self.model.save(os.path.join(folder, f"{Modelname}.keras"))

        artifacts = (
            (InputTokenizer, self.tokenizer_e),
            (OutputTokenizer, self.tokenizer_f),
            (Input_max_len_src, self.max_len_src),
            (Output_max_len_tgt, self.max_len_tgt),
        )
        for stem, obj in artifacts:
            with open(os.path.join(folder, f"{stem}.pkl"), "wb") as f:
                pickle.dump(obj, f)
def CreateModel(
    data,
    firstCol,
    SecondCol,
    name,
    InputTokenizer,
    OutputTokenizer,
    Input_max_len_src,
    Output_max_len_tgt,
    epochs=30,
    batch_size=64,
):
    """Run the full pipeline: clean the data, build, train and save a model.

    Args:
        data: Training table handed to ``Create``.
        firstCol: Name of the source-text column.
        SecondCol: Name of the target-text column.
        name: Model name, forwarded to ``saveModel`` as ``Modelname``.
        InputTokenizer: File stem for the saved source tokenizer.
        OutputTokenizer: File stem for the saved target tokenizer.
        Input_max_len_src: File stem for the saved source max length.
        Output_max_len_tgt: File stem for the saved target max length.
        epochs: Number of training epochs.
        batch_size: Training batch size.
    """
    pipeline = Create(
        data=data,
        inputCol=firstCol,
        outputCol=SecondCol,
        epochs=epochs,
        batch_size=batch_size,
    )
    # Each stage relies on attributes set by the one before it.
    pipeline.dataClean()
    pipeline.compileModel()
    pipeline.fitModel()
    pipeline.saveModel(
        Modelname=name,
        InputTokenizer=InputTokenizer,
        OutputTokenizer=OutputTokenizer,
        Input_max_len_src=Input_max_len_src,
        Output_max_len_tgt=Output_max_len_tgt,
    )