File size: 1,507 Bytes
38c5064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from tokenizers import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# 1. Load the trained tokenizers from their JSON files (HuggingFace `tokenizers` format).
src_tokenizer = Tokenizer.from_file("khmer_tokenizer.json")
tgt_tokenizer = Tokenizer.from_file("vietnamese_tokenizer.json")
# Trained Khmer -> Vietnamese sequence-to-sequence Keras model.
# NOTE(review): presumably takes [encoder_input, decoder_input] — confirm against training script.
model = load_model("translate_khmer_to_vi_fix_final.keras")
# Fixed sequence lengths the model was trained with — TODO confirm these match the training run.
max_len_src= 963
max_len_tgt= 268
# model.summary()
def translate_sentence(sentence):
    """Translate a Khmer sentence to Vietnamese using greedy decoding.

    Parameters
    ----------
    sentence : str
        The Khmer source sentence.

    Returns
    -------
    str
        The decoded Vietnamese translation.

    BUG FIX: the original built the decoder input as
    ``tgt_tokenizer.encode("<s>" + sentence)`` — i.e. the *Khmer* source
    text encoded with the *Vietnamese* tokenizer, fed in a single forward
    pass.  At inference time a seq2seq decoder has no access to the target
    sentence; it must be driven autoregressively, starting from the start
    token only and feeding back each predicted token.
    """
    # Encode and pad the source sentence for the encoder.
    source_ids = src_tokenizer.encode(sentence).ids
    encoder_input = pad_sequences([source_ids], maxlen=max_len_src, padding='post')

    # Seed the decoder with the start-of-sequence token.
    decoded_ids = tgt_tokenizer.encode("<s>").ids
    # End-of-sequence id; may be None if "</s>" is not in the vocabulary.
    end_id = tgt_tokenizer.token_to_id("</s>")

    # Greedy decoding: one model call per generated token, up to the
    # decoder length the model was trained with (max_len_tgt - 1).
    while len(decoded_ids) < max_len_tgt - 1:
        decoder_input = pad_sequences([decoded_ids], maxlen=max_len_tgt - 1,
                                      padding='post')
        output = model.predict([encoder_input, decoder_input], verbose=0)
        # Next token = argmax at the position of the last real decoder token.
        next_id = int(np.argmax(output[0, len(decoded_ids) - 1]))
        if end_id is not None and next_id == end_id:
            break
        decoded_ids.append(next_id)

    # `decode` skips special tokens (e.g. <s>) by default.
    return tgt_tokenizer.decode(decoded_ids)


# Demo: translate one Khmer sentence and print the Vietnamese result.
# (The string literal is the Khmer source text and must not be altered.)
khmer_sentence = "αž›αŸ„αž€αž“αžΆαž™αž€αžšαžŠαŸ’αž‹αž˜αž“αŸ’αžšαŸ’αžαžΈαž‘αžΎαž”αžαŸ‚αž”αžΆαž“αž’αžαŸ’αžαžΆαž’αž·αž”αŸ’αž”αžΆαž™αž›αžΎαž–αŸαžαŸŒαž˜αžΆαž“αž˜αž½αž™αž…αŸ†αž“αž½αž“αžŠαŸ‚αž›αž”αžΆαž“αž›αžΎαž€αž‘αžΎαž„αž“αŸ…αž€αŸ’αž“αž»αž„αž€αžΆαžŸαŸ‚αžαž’αŸαž‘αž·αž…αžαŸ’αžšαžΌαž“αž·αž€ VietNamNet αžŠαŸ‚αž›αž‘αžΆαž€αŸ‹αž‘αž„αž“αžΉαž„β€¦"


print(translate_sentence(khmer_sentence))