|
|
from tokenizers import Tokenizer |
|
|
from tensorflow.keras.models import load_model |
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
import numpy as np |
|
|
|
|
|
# Subword tokenizers in HuggingFace `tokenizers` JSON format, trained
# separately for the source (Khmer) and target (Vietnamese) languages.
src_tokenizer = Tokenizer.from_file("khmer_tokenizer.json")


tgt_tokenizer = Tokenizer.from_file("vietnamese_tokenizer.json")


# Trained Khmer -> Vietnamese Keras translation model.
model = load_model("translate_khmer_to_vi_fix_final.keras")


# Maximum sequence lengths — presumably the lengths used at training
# time; inputs are padded/truncated to these sizes below (TODO confirm
# against the training script).
max_len_src= 963


max_len_tgt= 268
|
|
|
|
|
def translate_sentence(sentence):
    """Translate a Khmer sentence into Vietnamese with the loaded model.

    Performs a single forward pass of the seq2seq model and greedily
    decodes the most likely token at every output position.

    Parameters
    ----------
    sentence : str
        Raw Khmer source text.

    Returns
    -------
    str
        Vietnamese text decoded by the target tokenizer.
    """
    # Tokenize the source sentence into subword ids.
    sentence_ids = src_tokenizer.encode(sentence).ids

    # Decoder input ids, prefixed with the start-of-sequence token.
    # NOTE(review): this feeds the *source* sentence (encoded with the
    # target tokenizer) as the decoder input, instead of decoding
    # autoregressively token by token — confirm this matches how the
    # model was trained before trusting the output.
    sentence_decode_in = tgt_tokenizer.encode("<s>" + sentence).ids

    # Pad/truncate both sequences to the lengths the model expects.
    x_train = pad_sequences([sentence_ids], maxlen=max_len_src, padding='post')
    train_encoder = pad_sequences([sentence_decode_in],
                                  maxlen=max_len_tgt-1, padding='post')

    # Single forward pass; output[0] is the per-position vocabulary
    # distribution for the one sentence in the batch.
    output = model.predict([x_train, train_encoder])

    # Greedy decoding: most likely token id at every position.
    predicted_ids = np.argmax(output[0], axis=-1)

    # Fix: the Rust-backed Tokenizer.decode expects a list of Python
    # ints; passing a numpy ndarray of int64 can raise a TypeError.
    tokens = tgt_tokenizer.decode(predicted_ids.tolist())

    return tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sample Khmer input. NOTE(review): the text below appears
# mojibake-encoded in this view, and the string literal spans several
# physical lines without triple quotes — almost certainly a
# display/extraction artifact; verify the original file keeps this
# literal on a single line with correct UTF-8 Khmer text.
khmer_sentence = "αααααΆαααααααααααααΈααΎαααααΆαα’ααααΆαα·ααααΆαααΎααααααΆααα½αα
ααα½ααααααΆαααΎαα‘αΎααα
αααα»αααΆαααα’αα‘α·α
ααααΌαα·α VietNamNet αααααΆααααααΉαβ¦"


# Translate the sample sentence and show the Vietnamese output.
print(translate_sentence(khmer_sentence))
|
|
|