# khmer_to_vi / run.py
# AnTrc2's picture
# Update run.py
# 38c5064 verified
from tokenizers import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Padding lengths used when building the model inputs: the source ids are
# padded to max_len_src and the decoder input to max_len_tgt - 1.
max_len_src = 963
max_len_tgt = 268

# 1. Load the source and target tokenizers from their tokenizer.json files.
src_tokenizer = Tokenizer.from_file("khmer_tokenizer.json")
tgt_tokenizer = Tokenizer.from_file("vietnamese_tokenizer.json")

# 2. Load the saved Keras model that translate_sentence() runs.
model = load_model("translate_khmer_to_vi_fix_final.keras")
# model.summary()  # uncomment to inspect the loaded architecture
def translate_sentence(sentence):
    """Translate a Khmer sentence into Vietnamese with greedy decoding.

    The source text is tokenized with ``src_tokenizer`` and padded to
    ``max_len_src``.  The decoder input is seeded with the ``"<s>"`` prefix
    and extended one token at a time: on every step the model is run on the
    tokens produced so far and the highest-scoring id at the last filled
    position is appended.

    Args:
        sentence: Source text to translate.

    Returns:
        The string decoded by ``tgt_tokenizer`` from the generated ids
        (the ``"<s>"`` prefix is not included).

    Raises:
        ValueError: If ``tgt_tokenizer`` encodes ``"<s>"`` to no ids, so
            there is nothing to seed the decoder with.
    """
    source_ids = src_tokenizer.encode(sentence).ids
    encoder_input = pad_sequences(
        [source_ids], maxlen=max_len_src, padding='post'
    )

    # The decoder input must come from the target side only.  The previous
    # version encoded the *source* sentence with the target tokenizer, which
    # fed the decoder unrelated token ids.
    generated_ids = list(tgt_tokenizer.encode("<s>").ids)
    if not generated_ids:
        raise ValueError('target tokenizer produced no ids for "<s>"')
    prefix_len = len(generated_ids)

    # NOTE(review): "</s>" as the end-of-sentence token is an assumption;
    # token_to_id returns None when the vocabulary has no such token, in
    # which case only the padding / length checks below stop the loop.
    end_id = tgt_tokenizer.token_to_id("</s>")
    decoder_len = max_len_tgt - 1

    # NOTE(review): assumes the model was trained with teacher forcing, so
    # output[0][t] scores the token that follows decoder position t --
    # confirm against the training script.
    # One predict() call per generated token: simple, but slow for long
    # outputs.
    while len(generated_ids) < decoder_len:
        decoder_input = pad_sequences(
            [generated_ids], maxlen=decoder_len, padding='post'
        )
        output = model.predict([encoder_input, decoder_input], verbose=0)
        next_id = int(np.argmax(output[0][len(generated_ids) - 1], axis=-1))
        # pad_sequences pads with 0, so a predicted 0 is treated as "nothing
        # more to emit" (presumably the padding id -- verify).
        if next_id == 0 or next_id == end_id:
            break
        generated_ids.append(next_id)

    # decode() is given a plain list of Python ints rather than a NumPy array.
    return tgt_tokenizer.decode(generated_ids[prefix_len:])
khmer_sentence = "αž›αŸ„αž€αž“αžΆαž™αž€αžšαžŠαŸ’αž‹αž˜αž“αŸ’αžšαŸ’αžαžΈαž‘αžΎαž”αžαŸ‚αž”αžΆαž“αž’αžαŸ’αžαžΆαž’αž·αž”αŸ’αž”αžΆαž™αž›αžΎαž–αŸαžαŸŒαž˜αžΆαž“αž˜αž½αž™αž…αŸ†αž“αž½αž“αžŠαŸ‚αž›αž”αžΆαž“αž›αžΎαž€αž‘αžΎαž„αž“αŸ…αž€αŸ’αž“αž»αž„αž€αžΆαžŸαŸ‚αžαž’αŸαž‘αž·αž…αžαŸ’αžšαžΌαž“αž·αž€ VietNamNet αžŠαŸ‚αž›αž‘αžΆαž€αŸ‹αž‘αž„αž“αžΉαž„β€¦"
# Run the demo translation only when this file is executed as a script, so
# importing the module for translate_sentence() does not trigger it.
if __name__ == "__main__":
    print(translate_sentence(khmer_sentence))