fromozu committed on
Commit
4bea626
·
verified ·
1 Parent(s): 2de84f5

Upload bilingual_book_maker/book_maker/clib.py with huggingface_hub

Browse files
bilingual_book_maker/book_maker/clib.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ from os import environ as env
5
+
6
+ from book_maker.loader import BOOK_LOADER_DICT
7
+ from book_maker.translator import MODEL_DICT
8
+ from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE
9
+
10
+
11
def parse_prompt_arg(prompt_arg):
    """Turn the value of ``--prompt`` into a prompt configuration dict.

    ``prompt_arg`` may be:

    * ``None`` -- no custom prompt was given; ``None`` is returned.
    * a JSON object string, e.g.
      ``'{"system": "...", "user": "Translate {text} to {language}"}'``.
    * any other string not ending in ``.json``/``.txt`` -- used verbatim as
      the ``user`` template.
    * a path ending in ``.txt`` -- the file content is the ``user`` template.
    * a path ending in ``.json`` -- the file content is the JSON object.

    Returns:
        ``None`` or a dict with the key ``user`` and optionally ``system``.

    Raises:
        FileNotFoundError: the argument looks like a ``.json``/``.txt`` path
            but no such file exists.
        ValueError: the prompt is not a JSON object, lacks the ``user`` key,
            lacks the ``{text}``/``{language}`` placeholders, or contains
            keys other than ``user`` and ``system``.
    """
    if prompt_arg is None:
        return None

    if not prompt_arg.endswith((".json", ".txt")):
        try:
            # user can define prompt by passing a json string
            # eg: --prompt '{"system": "You are a professional translator who translates computer technology books", "user": "Translate \`{text}\` to {language}"}'
            prompt = json.loads(prompt_arg)
        except json.JSONDecodeError:
            # if not a json string, treat it as a template string
            prompt = {"user": prompt_arg}
    elif os.path.exists(prompt_arg):
        with open(prompt_arg, encoding="utf-8") as f:
            if prompt_arg.endswith(".txt"):
                # a txt file holds the template string itself
                prompt = {"user": f.read()}
            else:
                # a json file holds the whole prompt object
                # eg: --prompt prompt_template_sample.json
                prompt = json.load(f)
    else:
        raise FileNotFoundError(f"{prompt_arg} not found")

    # Validate the container before indexing into it: valid JSON such as
    # `123`, `null` or a list is not a usable prompt configuration.
    if not isinstance(prompt, dict):
        raise ValueError("prompt must be a JSON object with the key of `user`")

    # The key check has to come before the placeholder check, otherwise a
    # missing key surfaces as a KeyError instead of this message.
    if "user" not in prompt:
        raise ValueError("prompt must contain the key of `user`")

    if any(c not in prompt["user"] for c in ["{text}", "{language}"]):
        raise ValueError("prompt must contain `{text}` and `{language}`")

    if (prompt.keys() - {"user", "system"}) != set():
        raise ValueError("prompt can only contain the keys of `user` and `system`")

    print("prompt config:", prompt)
    return prompt
49
+
50
+
51
def _build_parser():
    """Create the argument parser describing every command-line option."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--book_name",
        dest="book_name",
        type=str,
        help="path of the epub file to be translated",
    )
    parser.add_argument(
        "--book_from",
        dest="book_from",
        type=str,
        choices=["kobo"],  # support kindle later
        metavar="E-READER",
        help="e-reader type, available: {%(choices)s}",
    )
    parser.add_argument(
        "--device_path",
        dest="device_path",
        type=str,
        help="Path of e-reader device",
    )
    ########## KEYS ##########
    parser.add_argument(
        "--openai_key",
        dest="openai_key",
        type=str,
        default="",
        help="OpenAI api key,if you have more than one key, please use comma"
        " to split them to go beyond the rate limits",
    )
    parser.add_argument(
        "--caiyun_key",
        dest="caiyun_key",
        type=str,
        help="you can apply caiyun key from here (https://dashboard.caiyunapp.com/user/sign_in/)",
    )
    parser.add_argument(
        "--deepl_key",
        dest="deepl_key",
        type=str,
        help="you can apply deepl key from here (https://rapidapi.com/splintPRO/api/dpl-translator)",
    )
    parser.add_argument(
        "--claude_key",
        dest="claude_key",
        type=str,
        help="you can find claude key from here (https://console.anthropic.com/account/keys)",
    )

    parser.add_argument(
        "--custom_api",
        dest="custom_api",
        type=str,
        help="you should build your own translation api",
    )

    # for Google Gemini
    parser.add_argument(
        "--gemini_key",
        dest="gemini_key",
        type=str,
        help="You can get Gemini Key from https://makersuite.google.com/app/apikey",
    )

    parser.add_argument(
        "--test",
        dest="test",
        action="store_true",
        help="only the first 10 paragraphs will be translated, for testing",
    )
    parser.add_argument(
        "--test_num",
        dest="test_num",
        type=int,
        default=10,
        help="how many paragraphs will be translated for testing",
    )
    parser.add_argument(
        "-m",
        "--model",
        dest="model",
        type=str,
        default="chatgptapi",
        choices=list(MODEL_DICT),  # support DeepL later
        metavar="MODEL",
        help="model to use, available: {%(choices)s}",
    )
    parser.add_argument(
        "--language",
        type=str,
        choices=sorted(LANGUAGES) + sorted(k.title() for k in TO_LANGUAGE_CODE),
        default="zh-hans",
        metavar="LANGUAGE",
        help="language to translate to, available: {%(choices)s}",
    )
    parser.add_argument(
        "--resume",
        dest="resume",
        action="store_true",
        help="if program stop unexpected you can use this to resume",
    )
    parser.add_argument(
        "-p",
        "--proxy",
        dest="proxy",
        type=str,
        default="",
        help="use proxy like http://127.0.0.1:7890",
    )
    parser.add_argument(
        "--deployment_id",
        dest="deployment_id",
        type=str,
        help="the deployment name you chose when you deployed the model",
    )
    # args to change api_base
    parser.add_argument(
        "--api_base",
        metavar="API_BASE_URL",
        dest="api_base",
        type=str,
        help="specify base url other than the OpenAI's official API address",
    )
    parser.add_argument(
        "--exclude_filelist",
        dest="exclude_filelist",
        type=str,
        default="",
        help="if you have more than one file to exclude, please use comma to split them, example: --exclude_filelist 'nav.xhtml,cover.xhtml'",
    )
    parser.add_argument(
        "--only_filelist",
        dest="only_filelist",
        type=str,
        default="",
        help="if you only have a few files with translations, please use comma to split them, example: --only_filelist 'nav.xhtml,cover.xhtml'",
    )
    parser.add_argument(
        "--translate-tags",
        dest="translate_tags",
        type=str,
        default="p",
        help="example --translate-tags p,blockquote",
    )
    parser.add_argument(
        "--exclude_translate-tags",
        dest="exclude_translate_tags",
        type=str,
        default="sup",
        help="example --exclude_translate-tags table,sup",
    )
    parser.add_argument(
        "--allow_navigable_strings",
        dest="allow_navigable_strings",
        action="store_true",
        default=False,
        help="allow NavigableStrings to be translated",
    )
    parser.add_argument(
        "--prompt",
        dest="prompt_arg",
        type=str,
        metavar="PROMPT_ARG",
        help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
    )
    parser.add_argument(
        "--accumulated_num",
        dest="accumulated_num",
        type=int,
        default=1,
        help="""Wait for how many tokens have been accumulated before starting the translation.
        gpt3.5 limits the total_token to 4090.
        For example, if you use --accumulated_num 1600, maybe openai will output 2200 tokens
        and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000,
        So you are close to reaching the limit. You have to choose your own value, there is no way to know if the limit is reached before sending
        """,
    )
    parser.add_argument(
        "--translation_style",
        dest="translation_style",
        type=str,
        help="""ex: --translation_style "color: #808080; font-style: italic;" """,
    )
    parser.add_argument(
        "--batch_size",
        dest="batch_size",
        type=int,
        help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
    )
    # NOTE(review): nargs=4 requires all four values although the help text
    # calls end_str optional -- confirm how the loader consumes `retranslate`
    # before relaxing this.
    parser.add_argument(
        "--retranslate",
        dest="retranslate",
        nargs=4,
        type=str,
        help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
        Retranslate from start_str to end_str's tag:
        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
        Retranslate start_str's tag:
        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
        """,
    )
    parser.add_argument(
        "--single_translate",
        action="store_true",
        help="output translated book, no bilingual",
    )
    parser.add_argument(
        "--use_context",
        dest="context_flag",
        action="store_true",
        help="adds an additional paragraph for global, updating historical context of the story to the model's input, improving the narrative consistency for the AI model (this uses ~200 more tokens each time)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
    )
    return parser


def _resolve_api_key(options):
    """Return the API key for ``options.model`` from the CLI or the environment.

    The command-line value wins; otherwise the matching ``BBM_*`` environment
    variable is used (plus the legacy ``OPENAI_API_KEY`` for OpenAI models).

    Raises:
        Exception: the selected model needs a key and none was supplied.
    """
    model = options.model
    if model in ("chatgptapi", "gpt4"):
        openai_key = (
            options.openai_key
            # XXX: for backward compatibility, deprecate soon
            or env.get("OPENAI_API_KEY")
            # suggest adding `BBM_` prefix for all the bilingual_book_maker ENVs.
            or env.get("BBM_OPENAI_API_KEY")
        )
        if not openai_key:
            raise Exception(
                "OpenAI API key not provided, please google how to obtain it",
            )
        return openai_key

    # model -> (command-line value, environment variable, error when missing)
    key_sources = {
        "caiyun": (options.caiyun_key, "BBM_CAIYUN_API_KEY", "Please provide caiyun key"),
        "deepl": (options.deepl_key, "BBM_DEEPL_API_KEY", "Please provide deepl key"),
        "claude": (options.claude_key, "BBM_CLAUDE_API_KEY", "Please provide claude key"),
        "customapi": (
            options.custom_api,
            "BBM_CUSTOM_API",
            "Please provide custom translate api",
        ),
        # NOTE(review): a missing gemini key is not rejected here, unlike the
        # other providers -- confirm whether the translator handles that.
        "gemini": (options.gemini_key, "BBM_GOOGLE_GEMINI_KEY", None),
    }
    if model not in key_sources:
        # models without an entry are given an empty key
        return ""

    cli_value, env_name, missing_message = key_sources[model]
    api_key = cli_value or env.get(env_name)
    if not api_key and missing_message is not None:
        raise Exception(missing_message)
    return api_key


def main():
    """Command-line entry point: parse the options and run the translation.

    Steps: parse arguments, validate the input book, export the proxy
    settings, resolve the translator class and its API key, optionally fetch
    the book from a kobo device, build the loader for the book's file type,
    forward the optional settings to it and call ``make_bilingual_book()``.

    Raises:
        SystemExit: ``--book_name`` is missing or is not an existing file
            (exit status 1), or argparse rejected the command line.
        ValueError: unsupported model, or ``--deployment_id`` used with a
            non-OpenAI model or without ``--api_base``; also propagated from
            ``parse_prompt_arg``.
        Exception: a required API key or the kobo device path is missing, or
            the book's file type has no loader.
    """
    options = _build_parser().parse_args()

    # In kobo mode the book path is produced from the device further down,
    # so --book_name only has to point at an existing file otherwise.
    from_kobo = options.book_from == "kobo"
    if not from_kobo:
        if not options.book_name:
            print("Error: please provide the path of the book with --book_name <PATH>.")
            raise SystemExit(1)
        if not os.path.isfile(options.book_name):
            print(f"Error: {options.book_name} does not exist.")
            raise SystemExit(1)

    if options.proxy:
        os.environ["http_proxy"] = options.proxy
        os.environ["https_proxy"] = options.proxy

    translate_model = MODEL_DICT.get(options.model)
    if translate_model is None:
        # explicit raise rather than assert, which `python -O` strips
        raise ValueError("unsupported model")
    api_key = _resolve_api_key(options)

    if from_kobo:
        from book_maker import obok

        if options.device_path is None:
            raise Exception(
                "Device path is not given, please specify the path by --device_path <DEVICE_PATH>",
            )
        options.book_name = obok.cli_main(options.device_path)

    # the loader is selected by the text after the last dot of the book path
    book_type = options.book_name.split(".")[-1]
    if book_type not in BOOK_LOADER_DICT:
        raise Exception(
            f"now only support files of these formats: {','.join(BOOK_LOADER_DICT)}",
        )
    book_loader = BOOK_LOADER_DICT[book_type]

    # use the value mapped in LANGUAGES for the prompt when there is one
    language = LANGUAGES.get(options.language, options.language)

    book = book_loader(
        options.book_name,
        translate_model,
        api_key,
        options.resume,
        language=language,
        # change api_base for issue #42
        model_api_base=options.api_base,
        is_test=options.test,
        test_num=options.test_num,
        prompt_config=parse_prompt_arg(options.prompt_arg),
        single_translate=options.single_translate,
        context_flag=options.context_flag,
        temperature=options.temperature,
    )

    # other options: forwarded only when set, in the same order as before
    for name in (
        "allow_navigable_strings",
        "translate_tags",
        "exclude_translate_tags",
        "exclude_filelist",
        "only_filelist",
    ):
        value = getattr(options, name)
        if value:
            setattr(book, name, value)
    if options.accumulated_num > 1:
        book.accumulated_num = options.accumulated_num
    for name in ("translation_style", "batch_size", "retranslate"):
        value = getattr(options, name)
        if value:
            setattr(book, name, value)

    if options.deployment_id:
        # only work for ChatGPT api for now
        # later maybe support others
        if options.model not in ("chatgptapi", "gpt4"):
            raise ValueError("only support chatgptapi for deployment_id")
        if not options.api_base:
            raise ValueError("`api_base` must be provided when using `deployment_id`")
        book.translate_model.set_deployment_id(options.deployment_id)
    # TODO refactor, quick fix for gpt4 model
    if options.model == "gpt4":
        book.translate_model.set_gpt4_models("gpt4")

    book.make_bilingual_book()
399
+
400
+
401
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()