fromozu committed on
Commit
2de84f5
·
verified ·
1 Parent(s): 75e2413

Upload bilingual_book_maker/book_maker/cli.py with huggingface_hub

Browse files
bilingual_book_maker/book_maker/cli.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ from os import environ as env
5
+
6
+ from book_maker.backmatter import DEFAULT_BACKMATTER_TITLES, split_backmatter_titles
7
+ from book_maker.loader import BOOK_LOADER_DICT
8
+ from book_maker.translator import MODEL_DICT
9
+ from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE
10
+
11
+
12
def parse_prompt_arg(prompt_arg):
    """Build the prompt configuration from the value given to ``--prompt``.

    ``prompt_arg`` is interpreted as follows:

    * ``None``: no custom prompt was requested, ``None`` is returned.
    * A string that does not end in ``.json``/``.txt``: parsed as a JSON
      object if possible, e.g.
      ``'{"system": "...", "user": "Translate {text} to {language}"}'``;
      otherwise the raw string is used as the ``user`` template.
    * A path ending in ``.txt``: the file content is the ``user`` template.
    * A path ending in ``.json``: the file content is the prompt object.

    Returns:
        A dict with the key ``user`` and optionally ``system``, or ``None``
        when ``prompt_arg`` is ``None``.

    Raises:
        FileNotFoundError: ``prompt_arg`` ends in ``.json``/``.txt`` but the
            path does not exist.
        ValueError: the prompt has no ``user`` key, the ``user`` template is
            not a string containing both ``{text}`` and ``{language}``, or
            keys other than ``user``/``system`` are present.
    """
    if prompt_arg is None:
        return None

    if not prompt_arg.endswith((".json", ".txt")):
        try:
            # The user can define the prompt by passing a JSON string.
            prompt = json.loads(prompt_arg)
        except json.JSONDecodeError:
            # Not JSON at all: treat it as a template string.
            prompt = {"user": prompt_arg}
        else:
            if not isinstance(prompt, dict):
                # Valid JSON that is not an object (a bare number, a quoted
                # string, a list, ...) cannot be a prompt config, so treat the
                # raw argument as a template string as well.
                prompt = {"user": prompt_arg}
    elif os.path.exists(prompt_arg):
        with open(prompt_arg, encoding="utf-8") as f:
            if prompt_arg.endswith(".txt"):
                # A txt file holds the template string itself.
                prompt = {"user": f.read()}
            else:
                # A json file holds the whole prompt object,
                # eg: --prompt prompt_template_sample.json
                prompt = json.load(f)
    else:
        raise FileNotFoundError(f"{prompt_arg} not found")

    # Check the shape first so that a missing `user` key is reported as such
    # instead of surfacing as a KeyError/TypeError from the placeholder check.
    if not isinstance(prompt, dict) or "user" not in prompt:
        raise ValueError("prompt must contain the key of `user`")

    user_template = prompt["user"]
    if not isinstance(user_template, str) or any(
        placeholder not in user_template for placeholder in ("{text}", "{language}")
    ):
        raise ValueError("prompt must contain `{text}` and `{language}`")

    if prompt.keys() - {"user", "system"}:
        raise ValueError("prompt can only contain the keys of `user` and `system`")

    print("prompt config:", prompt)
    return prompt
50
+
51
+
52
def main():
    """Command-line entry point: parse options, build a loader, translate.

    The function parses the CLI arguments, resolves the API key for the
    selected model (CLI flag first, then environment variables), optionally
    fetches the book from a Kobo device, instantiates the loader registered in
    ``BOOK_LOADER_DICT`` for the book's file extension, copies the optional
    settings onto it and finally calls ``make_bilingual_book()``.

    Raises:
        SystemExit: with status 1 when no usable ``--book_name`` is given for
            a local book (argparse itself also exits on invalid arguments).
        Exception: when a required API key or the Kobo device path is missing,
            or when the book's file extension has no registered loader.
        ValueError: when ``--deployment_id`` is used without ``--api_base``,
            or when the ``--prompt`` value is invalid.
    """
    translate_model_list = list(MODEL_DICT.keys())
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--book_name",
        dest="book_name",
        type=str,
        help="path of the epub file to be translated",
    )
    parser.add_argument(
        "--book_from",
        dest="book_from",
        type=str,
        choices=["kobo"],  # support kindle later
        metavar="E-READER",
        help="e-reader type, available: {%(choices)s}",
    )
    parser.add_argument(
        "--device_path",
        dest="device_path",
        type=str,
        help="Path of e-reader device",
    )
    ########## KEYS ##########
    parser.add_argument(
        "--openai_key",
        dest="openai_key",
        type=str,
        default="",
        help="OpenAI api key,if you have more than one key, please use comma"
        " to split them to go beyond the rate limits",
    )
    parser.add_argument(
        "--caiyun_key",
        dest="caiyun_key",
        type=str,
        help="you can apply caiyun key from here (https://dashboard.caiyunapp.com/user/sign_in/)",
    )
    parser.add_argument(
        "--deepl_key",
        dest="deepl_key",
        type=str,
        help="you can apply deepl key from here (https://rapidapi.com/splintPRO/api/dpl-translator",
    )
    parser.add_argument(
        "--claude_key",
        dest="claude_key",
        type=str,
        help="you can find claude key from here (https://console.anthropic.com/account/keys)",
    )

    parser.add_argument(
        "--custom_api",
        dest="custom_api",
        type=str,
        help="you should build your own translation api",
    )

    # for Google Gemini
    parser.add_argument(
        "--gemini_key",
        dest="gemini_key",
        type=str,
        help="You can get Gemini Key from https://makersuite.google.com/app/apikey",
    )

    parser.add_argument(
        "--test",
        dest="test",
        action="store_true",
        help="only the first 10 paragraphs will be translated, for testing",
    )
    parser.add_argument(
        "--test_num",
        dest="test_num",
        type=int,
        default=10,
        help="how many paragraphs will be translated for testing",
    )
    parser.add_argument(
        "-m",
        "--model",
        dest="model",
        type=str,
        default="chatgptapi",
        choices=translate_model_list,  # support DeepL later
        metavar="MODEL",
        help="model to use, available: {%(choices)s}",
    )
    parser.add_argument(
        "--language",
        type=str,
        choices=sorted(LANGUAGES.keys())
        + sorted([k.title() for k in TO_LANGUAGE_CODE]),
        default="zh-hans",
        metavar="LANGUAGE",
        help="language to translate to, available: {%(choices)s}",
    )
    parser.add_argument(
        "--resume",
        dest="resume",
        action="store_true",
        help="if program stop unexpected you can use this to resume",
    )
    parser.add_argument(
        "-p",
        "--proxy",
        dest="proxy",
        type=str,
        default="",
        help="use proxy like http://127.0.0.1:7890",
    )
    parser.add_argument(
        "--deployment_id",
        dest="deployment_id",
        type=str,
        help="the deployment name you chose when you deployed the model",
    )
    # args to change api_base
    parser.add_argument(
        "--api_base",
        metavar="API_BASE_URL",
        dest="api_base",
        type=str,
        help="specify base url other than the OpenAI's official API address",
    )
    parser.add_argument(
        "--exclude_filelist",
        dest="exclude_filelist",
        type=str,
        default="",
        help="if you have more than one file to exclude, please use comma to split them, example: --exclude_filelist 'nav.xhtml,cover.xhtml'",
    )
    parser.add_argument(
        "--only_filelist",
        dest="only_filelist",
        type=str,
        default="",
        help="if you only have a few files with translations, please use comma to split them, example: --only_filelist 'nav.xhtml,cover.xhtml'",
    )
    parser.add_argument(
        "--translate-tags",
        dest="translate_tags",
        type=str,
        default="p,h1,h2,h3,h4,li,div",
        help="example --translate-tags p,blockquote",
    )
    parser.add_argument(
        "--exclude_translate-tags",
        dest="exclude_translate_tags",
        type=str,
        default="sup",
        help="example --exclude_translate-tags table,sup",
    )
    parser.add_argument(
        "--allow_navigable_strings",
        dest="allow_navigable_strings",
        action="store_true",
        default=False,
        help="allow NavigableStrings to be translated",
    )
    parser.add_argument(
        "--prompt",
        dest="prompt_arg",
        type=str,
        metavar="PROMPT_ARG",
        help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
    )
    parser.add_argument(
        "--accumulated_num",
        dest="accumulated_num",
        type=int,
        default=1,
        help="""Wait for how many tokens have been accumulated before starting the translation.
gpt3.5 limits the total_token to 4090.
For example, if you use --accumulated_num 1600, maybe openai will output 2200 tokens
and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000,
So you are close to reaching the limit. You have to choose your own value, there is no way to know if the limit is reached before sending
""",
    )
    parser.add_argument(
        "--translation_style",
        dest="translation_style",
        type=str,
        help="""ex: --translation_style "color: #808080; font-style: italic;" """,
    )
    parser.add_argument(
        "--batch_size",
        dest="batch_size",
        type=int,
        help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
    )
    parser.add_argument(
        "--checkpoint_interval",
        dest="checkpoint_interval",
        type=int,
        default=50,
        help="save EPUB resume checkpoint after this many processed units",
    )
    parser.add_argument(
        "--skip_backmatter_after_percent",
        dest="skip_backmatter_after_percent",
        type=int,
        default=50,
        help="after this percent of EPUB progress, backmatter chapters can be skipped",
    )
    parser.add_argument(
        "--skip_backmatter_titles",
        dest="skip_backmatter_titles",
        type=str,
        default=",".join(DEFAULT_BACKMATTER_TITLES),
        help="comma-separated EPUB backmatter chapter titles to stop translating after the threshold",
    )
    parser.add_argument(
        "--review_mode",
        dest="review_mode",
        type=str,
        default="suspicious_only",
        choices=["always", "suspicious_only", "never"],
        help="review strategy for gemini translator",
    )
    parser.add_argument(
        "--review_min_chinese_ratio",
        dest="review_min_chinese_ratio",
        type=float,
        default=0.2,
        help="minimum chinese character ratio before gemini review is skipped",
    )
    parser.add_argument(
        "--review_length_ratio_min",
        dest="review_length_ratio_min",
        type=float,
        default=0.35,
        help="minimum translated/source length ratio before gemini review is triggered",
    )
    parser.add_argument(
        "--review_length_ratio_max",
        dest="review_length_ratio_max",
        type=float,
        default=2.5,
        help="maximum translated/source length ratio before gemini review is triggered",
    )
    # NOTE(review): nargs=4 makes all four values mandatory although the help
    # text calls end_str optional -- confirm the intended behaviour.
    parser.add_argument(
        "--retranslate",
        dest="retranslate",
        nargs=4,
        type=str,
        help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
Retranslate from start_str to end_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
Retranslate start_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
""",
    )
    parser.add_argument(
        "--single_translate",
        action="store_true",
        help="output translated book, no bilingual",
    )
    parser.add_argument(
        "--use_context",
        dest="context_flag",
        action="store_true",
        help="adds an additional paragraph for global, updating historical context of the story to the model's input, improving the narrative consistency for the AI model (this uses ~200 more tokens each time)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
    )

    options = parser.parse_args()

    # A book fetched from a Kobo device gets its path from obok further down,
    # so --book_name is only required (and only checked) for local files.
    if options.book_from != "kobo":
        if not options.book_name:
            # Without this guard os.path.isfile(None) raises a TypeError.
            print("Error: please provide the path of your book using --book_name <path>.")
            raise SystemExit(1)
        if not os.path.isfile(options.book_name):
            print(f"Error: {options.book_name} does not exist.")
            raise SystemExit(1)

    proxy = options.proxy
    if proxy != "":
        os.environ["http_proxy"] = proxy
        os.environ["https_proxy"] = proxy

    translate_model = MODEL_DICT.get(options.model)
    assert translate_model is not None, "unsupported model"

    # Resolve the API key: the CLI flag wins, environment variables are the
    # fallback.
    api_key = ""
    if options.model in ["chatgptapi", "gpt4"]:
        api_key = (
            options.openai_key
            # XXX: for backward compatibility, deprecate soon
            or env.get("OPENAI_API_KEY")
            # suggest adding `BBM_` prefix for all the bilingual_book_maker ENVs.
            or env.get("BBM_OPENAI_API_KEY")
        )
        if not api_key:
            raise Exception(
                "OpenAI API key not provided, please google how to obtain it",
            )
    elif options.model == "caiyun":
        api_key = options.caiyun_key or env.get("BBM_CAIYUN_API_KEY")
        if not api_key:
            raise Exception("Please provide caiyun key")
    elif options.model == "deepl":
        api_key = options.deepl_key or env.get("BBM_DEEPL_API_KEY")
        if not api_key:
            raise Exception("Please provide deepl key")
    elif options.model == "claude":
        api_key = options.claude_key or env.get("BBM_CLAUDE_API_KEY")
        if not api_key:
            raise Exception("Please provide claude key")
    elif options.model == "customapi":
        api_key = options.custom_api or env.get("BBM_CUSTOM_API")
        if not api_key:
            raise Exception("Please provide custom translate api")
    elif options.model == "gemini":
        # NOTE(review): unlike the other models a missing key is not rejected
        # here -- confirm the gemini translator handles a None key.
        api_key = options.gemini_key or env.get("BBM_GOOGLE_GEMINI_KEY")
    else:
        api_key = ""

    if options.book_from == "kobo":
        from book_maker import obok

        device_path = options.device_path
        if device_path is None:
            raise Exception(
                "Device path is not given, please specify the path by --device_path <DEVICE_PATH>",
            )
        options.book_name = obok.cli_main(device_path)

    # The loader is selected by the text after the last dot of the book path.
    book_type = options.book_name.split(".")[-1]
    support_type_list = list(BOOK_LOADER_DICT.keys())
    if book_type not in support_type_list:
        raise Exception(
            f"now only support files of these formats: {','.join(support_type_list)}",
        )

    book_loader = BOOK_LOADER_DICT.get(book_type)
    assert book_loader is not None, "unsupported loader"
    language = options.language
    if options.language in LANGUAGES:
        # use the value for prompt
        language = LANGUAGES.get(language, language)

    # change api_base for issue #42
    model_api_base = options.api_base

    e = book_loader(
        options.book_name,
        translate_model,
        api_key,
        options.resume,
        language=language,
        model_api_base=model_api_base,
        is_test=options.test,
        test_num=options.test_num,
        prompt_config=parse_prompt_arg(options.prompt_arg),
        single_translate=options.single_translate,
        context_flag=options.context_flag,
        temperature=options.temperature,
    )
    # other options: only overwrite the loader's own defaults when a value
    # was actually supplied
    if options.allow_navigable_strings:
        e.allow_navigable_strings = True
    if options.translate_tags:
        e.translate_tags = options.translate_tags
    if options.exclude_translate_tags:
        e.exclude_translate_tags = options.exclude_translate_tags
    if options.exclude_filelist:
        e.exclude_filelist = options.exclude_filelist
    if options.only_filelist:
        e.only_filelist = options.only_filelist
    if options.accumulated_num > 1:
        e.accumulated_num = options.accumulated_num
    if options.translation_style:
        e.translation_style = options.translation_style
    if options.batch_size:
        e.batch_size = options.batch_size
    if options.checkpoint_interval is not None:
        e.checkpoint_interval = options.checkpoint_interval
    # Negative percentages are clamped to 0.
    e.skip_backmatter_after_percent = max(0, options.skip_backmatter_after_percent)
    e.skip_backmatter_titles = split_backmatter_titles(options.skip_backmatter_titles)
    if options.model == "gemini":
        e.translate_model.review_mode = options.review_mode
        e.translate_model.review_min_chinese_ratio = options.review_min_chinese_ratio
        e.translate_model.review_length_ratio_min = options.review_length_ratio_min
        e.translate_model.review_length_ratio_max = options.review_length_ratio_max
    if options.retranslate:
        e.retranslate = options.retranslate
    if options.deployment_id:
        # only work for ChatGPT api for now
        # later maybe support others
        assert options.model in [
            "chatgptapi",
            "gpt4",
        ], "only support chatgptapi for deployment_id"
        if not options.api_base:
            raise ValueError("`api_base` must be provided when using `deployment_id`")
        e.translate_model.set_deployment_id(options.deployment_id)
    # TODO refactor, quick fix for gpt4 model
    if options.model == "gpt4":
        e.translate_model.set_gpt4_models("gpt4")

    e.make_bilingual_book()
459
+
460
+
461
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()