File size: 1,507 Bytes
38c5064 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from tokenizers import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# 1. Load the trained subword tokenizers from their JSON files.
#    (Original comment was Vietnamese: "Load from tokenizer.json file".)
src_tokenizer = Tokenizer.from_file("khmer_tokenizer.json")
tgt_tokenizer = Tokenizer.from_file("vietnamese_tokenizer.json")
# Trained encoder-decoder translation model (Khmer -> Vietnamese).
model = load_model("translate_khmer_to_vi_fix_final.keras")
# Padding lengths used at training time — must match the model's input shapes.
# NOTE(review): presumably these came from the training corpus maxima; confirm
# against the training script before changing.
max_len_src= 963
max_len_tgt= 268
# model.summary()
def translate_sentence(sentence):
    """Translate a Khmer sentence to Vietnamese with greedy decoding.

    Args:
        sentence: Raw Khmer source text.

    Returns:
        The decoded Vietnamese string produced by the model.

    BUG FIX(review): the original built the decoder input by encoding
    "<s>" + the *Khmer source* with the *target* tokenizer — i.e. it fed
    the source text where the teacher-forced target tokens belong, which
    only makes sense during training. At inference the target is unknown,
    so we decode autoregressively: start from the <s> token and append
    the argmax prediction one step at a time until </s> or the length cap.
    """
    # Encode and pad the source exactly as at training time.
    sentence_ids = src_tokenizer.encode(sentence).ids
    encoder_input = pad_sequences([sentence_ids], maxlen=max_len_src, padding='post')

    # Special-token ids from the target vocabulary.
    # NOTE(review): assumes the tokenizer was trained with "<s>"/"</s>"
    # markers, as the original "<s>" prefix suggests — confirm.
    start_id = tgt_tokenizer.token_to_id("<s>")
    end_id = tgt_tokenizer.token_to_id("</s>")

    decoded_ids = [start_id]
    # Decoder input length matches the original call: max_len_tgt - 1.
    for _ in range(max_len_tgt - 1):
        decoder_input = pad_sequences([decoded_ids], maxlen=max_len_tgt - 1, padding='post')
        output = model.predict([encoder_input, decoder_input], verbose=0)
        # Next token = argmax over the vocab at the last generated position.
        next_id = int(np.argmax(output[0, len(decoded_ids) - 1]))
        if end_id is not None and next_id == end_id:
            break
        decoded_ids.append(next_id)

    # Drop the leading <s> before decoding back to text.
    return tgt_tokenizer.decode(decoded_ids[1:])
# NOTE(review): the original literal spanned four physical lines inside a
# single-quoted string, which is a SyntaxError in Python. A triple-quoted
# literal preserves the exact same text (including the line breaks).
# The content itself looks mojibake-encoded (Khmer UTF-8 mis-decoded) —
# confirm the file is read and saved as UTF-8.
khmer_sentence = """αααααΆαααααααααααααΈααΎαααααΆαα’ααααΆαα·ααααΆαααΎααααααΆααα½αα
ααα½ααααααΆαααΎαα‘αΎααα
αααα»αααΆαααα’αα‘α·α
ααααΌαα·α VietNamNet αααααΆααααααΉαβ¦"""
print(translate_sentence(khmer_sentence))
|