AnTrc2 commited on
Commit
38c5064
·
verified ·
1 Parent(s): f5f2283

Update run.py

Browse files
Files changed (1) hide show
  1. run.py +36 -36
run.py CHANGED
@@ -1,36 +1,36 @@
1
- from tokenizers import Tokenizer
2
- from tensorflow.keras.models import load_model
3
- from tensorflow.keras.preprocessing.sequence import pad_sequences
4
- import numpy as np
5
- # 1. Load tα»« file tokenizer.json
6
- src_tokenizer = Tokenizer.from_file("khmer_tokenizer.json")
7
- tgt_tokenizer = Tokenizer.from_file("vietnamese_tokenizer.json")
8
- model = load_model("translate_khmer_to_vi_fix_final.keras")
9
- max_len_src= 963
10
- max_len_tgt= 268
11
- model.summary()
12
- # def translate_sentence(sentence):
13
-
14
- # sentence_ids = src_tokenizer.encode(sentence).ids
15
- # sentence_decode_in = tgt_tokenizer.encode("<s>" + sentence).ids
16
-
17
- # # return sentence_ids
18
- # x_train = pad_sequences([sentence_ids], maxlen=max_len_src, padding='post')
19
- # train_encoder = pad_sequences([sentence_decode_in],maxlen=max_len_tgt-1,padding='post')
20
- # output = model.predict([x_train,train_encoder])
21
- # # return output
22
-
23
- # predicted_ids = np.argmax(output[0], axis=-1)
24
- # # return predicted_ids
25
- # # return tgt_tokenizer.decode(output[0]), tgt_tokenizer.decode(output[1])
26
- # tokens = tgt_tokenizer.decode(predicted_ids)
27
-
28
- # return tokens
29
-
30
- # # return predicted_ids
31
-
32
-
33
- # khmer_sentence = "αž›αŸ„αž€αž“αžΆαž™αž€αžšαžŠαŸ’αž‹αž˜αž“αŸ’αžšαŸ’αžαžΈαž‘αžΎαž”αžαŸ‚αž”αžΆαž“αž’αžαŸ’αžαžΆαž’αž·αž”αŸ’αž”αžΆαž™αž›αžΎαž–αŸαžαŸŒαž˜αžΆαž“αž˜αž½αž™αž…αŸ†αž“αž½αž“αžŠαŸ‚αž›αž”αžΆαž“αž›αžΎαž€αž‘αžΎαž„αž“αŸ…αž€αŸ’αž“αž»αž„αž€αžΆαžŸαŸ‚αžαž’αŸαž‘αž·αž…αžαŸ’αžšαžΌαž“αž·αž€ VietNamNet αžŠαŸ‚αž›αž‘αžΆαž€αŸ‹αž‘αž„αž“αžΉαž„β€¦"
34
-
35
-
36
- # print(translate_sentence(khmer_sentence))
 
1
from tokenizers import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# 1. Load the trained tokenizers from their tokenizer.json files
# (source side = Khmer, target side = Vietnamese).
src_tokenizer = Tokenizer.from_file("khmer_tokenizer.json")
tgt_tokenizer = Tokenizer.from_file("vietnamese_tokenizer.json")
# Pre-trained Khmer -> Vietnamese translation model in Keras format.
model = load_model("translate_khmer_to_vi_fix_final.keras")
# Fixed sequence lengths: encoder input is padded to max_len_src tokens,
# decoder input to max_len_tgt - 1 (see translate_sentence) — presumably
# the lengths the model was trained with; confirm against training code.
max_len_src= 963
max_len_tgt= 268
# model.summary()
12
def translate_sentence(sentence):
    """Translate one Khmer sentence to Vietnamese with a single model call.

    Args:
        sentence: Raw Khmer source text.

    Returns:
        The decoded Vietnamese string produced by the target tokenizer.
    """
    # Token ids of the source sentence, from the Khmer tokenizer.
    source_ids = src_tokenizer.encode(sentence).ids
    # Decoder-side ids: "<s>" prepended to the *source* text, run through the
    # target tokenizer. NOTE(review): a seq2seq decoder is normally fed the
    # target-language prefix autoregressively; feeding the source text here
    # looks suspicious — confirm this matches how the model was trained.
    decoder_ids = tgt_tokenizer.encode("<s>" + sentence).ids

    # Pad both sequences (post-padding) to the fixed lengths the model expects;
    # each becomes a batch of one.
    encoder_input = pad_sequences([source_ids], maxlen=max_len_src, padding='post')
    decoder_input = pad_sequences([decoder_ids], maxlen=max_len_tgt - 1, padding='post')

    # One forward pass; output[0] is the prediction for the single batch
    # element — assumed shape (seq_len, vocab); TODO confirm.
    output = model.predict([encoder_input, decoder_input])
    best_ids = np.argmax(output[0], axis=-1)

    # Greedy decode: pick the argmax token at every position and detokenize.
    return tgt_tokenizer.decode(best_ids)
29
+
30
# Smoke test: translate one sample Khmer sentence (news excerpt) and print
# the result when the script is run directly.
khmer_sentence = "αž›αŸ„αž€αž“αžΆαž™αž€αžšαžŠαŸ’αž‹αž˜αž“αŸ’αžšαŸ’αžαžΈαž‘αžΎαž”αžαŸ‚αž”αžΆαž“αž’αžαŸ’αžαžΆαž’αž·αž”αŸ’αž”αžΆαž™αž›αžΎαž–αŸαžαŸŒαž˜αžΆαž“αž˜αž½αž™αž…αŸ†αž“αž½αž“αžŠαŸ‚αž›αž”αžΆαž“αž›αžΎαž€αž‘αžΎαž„αž“αŸ…αž€αŸ’αž“αž»αž„αž€αžΆαžŸαŸ‚αžαž’αŸαž‘αž·αž…αžαŸ’αžšαžΌαž“αž·αž€ VietNamNet αžŠαŸ‚αž›αž‘αžΆαž€αŸ‹αž‘αž„αž“αžΉαž„β€¦"
print(translate_sentence(khmer_sentence))