datasetsANDmodels committed on
Commit
1d8a377
·
verified ·
1 Parent(s): ae974c1

Upload usage_bpe.py

Browse files
Files changed (1) hide show
  1. usage_bpe.py +78 -0
usage_bpe.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import time
3
+ from subword_nmt.apply_bpe import BPE
4
+ import ctranslate2
5
+
6
class TranslationServer:
    """Translate text with a CTranslate2 model that uses subword-nmt BPE.

    The model directory is expected to contain the CTranslate2 model files
    plus ``bpe.model`` (BPE codes) and ``shared_vocabulary.json``.
    """

    # Ordered (old, new) pairs applied to raw model output.
    # - "@@ " is the subword-nmt continuation marker followed by the joining
    #   space; both must go so that "men@@ el@@ epon" becomes "menelepon".
    # - A leftover "@@" (e.g. on the final token) is removed afterwards.
    # - "&amp;" is decoded last so "&amp;quot;" yields the literal "&quot;"
    #   instead of being unescaped twice.
    _OUTPUT_REPLACEMENTS = (
        ("@@ ", ""),
        ("@@", ""),
        ("▁", " "),
        ("&apos;", "'"),
        ("&quot;", "\""),
        ("&amp;", "&"),
    )

    def __init__(self, model_path="en_id"):
        """Load the BPE codes, the vocabulary and the CTranslate2 model.

        Args:
            model_path: Directory holding the model; ``bpe.model`` and
                ``shared_vocabulary.json`` are read from inside it.

        Raises:
            OSError: If the BPE or vocabulary file cannot be opened.
        """
        self.model_path = model_path
        self.bpe_path = f"{model_path}/bpe.model"
        self.vocab_path = f"{model_path}/shared_vocabulary.json"

        # Load BPE
        with open(self.bpe_path, "r", encoding="utf-8") as bpe_file:
            self.bpe = BPE(bpe_file)

        # Load vocab
        with open(self.vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)

        # NOTE(review): presumably the JSON is a list of tokens, so the
        # position in the list is the token id -- confirm against the model.
        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for i, token in enumerate(self.vocab)}

        # Load CTranslate2 model
        self.translator = ctranslate2.Translator(model_path)

    def clean_text(self, text: str) -> str:
        """Return *text* stripped, with whitespace runs collapsed to one space."""
        return " ".join(text.strip().split())

    def normalize_output(self, text: str) -> str:
        """Turn space-joined model output tokens into readable text.

        Merges BPE pieces, maps the "▁" marker to a space, decodes the
        ``&apos;``, ``&quot;`` and ``&amp;`` entities and collapses repeated
        whitespace.
        """
        for old, new in self._OUTPUT_REPLACEMENTS:
            text = text.replace(old, new)
        # remove double spaces
        return " ".join(text.split()).strip()

    def translate_bpe(self, text: str, beam_size: int = 5) -> str:
        """Translate one already-cleaned sentence.

        Args:
            text: Source sentence.
            beam_size: Beam width passed to ``translate_batch``.

        Returns:
            The best hypothesis as normalized text, or ``""`` when *text*
            produces no tokens.
        """
        # Tokenize with BPE
        tokens = self.bpe.process_line(text).split()
        if not tokens:
            # Nothing to translate; skip the model call entirely.
            return ""

        # Run translation
        result = self.translator.translate_batch(
            [tokens],
            beam_size=beam_size,
            length_penalty=1.0,
        )

        # Extract tokens (new CTranslate2 API): best hypothesis of the
        # single sentence in the batch.
        output_tokens = result[0].hypotheses[0]

        # Join tokens & de-BPE
        return self.normalize_output(" ".join(output_tokens))

    def translate_text(self, text: str) -> str:
        """Clean *text*, translate it and return the normalized translation."""
        return self.translate_bpe(self.clean_text(text))
70
+
71
+
72
if __name__ == "__main__":
    # Manual smoke test: load the model from the "en_id" directory and
    # print the translation of a single sample sentence.
    # Alternative sample input: "Saya menelepon dari kantor pajak."
    sample_sentence = "I am calling from tax office."
    translation_server = TranslationServer("en_id")
    print(translation_server.translate_text(sample_sentence))
78
+