Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

ebook-executor / bilingual_book_maker / book_maker / cli.py

fromozu

Upload bilingual_book_maker/book_maker/cli.py with huggingface_hub

2de84f5 verified about 1 month ago

raw

history blame contribute delete

16.4 kB

	import argparse
	import json
	import os
	from os import environ as env

	from book_maker.backmatter import DEFAULT_BACKMATTER_TITLES, split_backmatter_titles
	from book_maker.loader import BOOK_LOADER_DICT
	from book_maker.translator import MODEL_DICT
	from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE


	def parse_prompt_arg(prompt_arg):
	    """Turn the value of ``--prompt`` into a prompt configuration dict.

	    ``prompt_arg`` is interpreted as follows:

	    * ``None``: no custom prompt was requested and ``None`` is returned.
	    * A string ending in ``.txt``: a path whose content becomes the ``user``
	      template.
	    * A string ending in ``.json``: a path whose content is parsed as the
	      prompt object.
	    * Any other string: parsed as JSON when possible, otherwise used
	      verbatim as the ``user`` template.

	    Args:
	        prompt_arg: raw command-line value, or ``None``.

	    Returns:
	        ``None`` when ``prompt_arg`` is ``None``; otherwise a dict holding a
	        ``user`` template and, optionally, a ``system`` entry.

	    Raises:
	        FileNotFoundError: ``prompt_arg`` ends in ``.txt``/``.json`` but the
	            path does not exist.
	        ValueError: the prompt is not a JSON object, has no ``user`` key, its
	            ``user`` template lacks ``{text}`` or ``{language}``, or it has
	            keys other than ``user`` and ``system``.
	    """
	    if prompt_arg is None:
	        return None

	    if not prompt_arg.endswith((".json", ".txt")):
	        try:
	            # user can define prompt by passing a json string
	            # eg: --prompt '{"system": "You are a professional translator who translates computer technology books", "user": "Translate \`{text}\` to {language}"}'
	            prompt = json.loads(prompt_arg)
	        except json.JSONDecodeError:
	            # if not a json string, treat it as a template string
	            prompt = {"user": prompt_arg}
	    elif os.path.exists(prompt_arg):
	        with open(prompt_arg, encoding="utf-8") as f:
	            if prompt_arg.endswith(".txt"):
	                # a txt file is treated as a template string
	                prompt = {"user": f.read()}
	            else:
	                # a json file is treated as a json object
	                # eg: --prompt prompt_template_sample.json
	                prompt = json.load(f)
	    else:
	        raise FileNotFoundError(f"{prompt_arg} not found")

	    # Validate the container and the `user` key BEFORE reading prompt["user"];
	    # otherwise malformed input escapes as KeyError/TypeError instead of the
	    # ValueError this function promises.
	    if not isinstance(prompt, dict):
	        raise ValueError("prompt must be a template string or a JSON object")

	    if "user" not in prompt:
	        raise ValueError("prompt must contain the key of `user`")

	    user_template = prompt["user"]
	    if not isinstance(user_template, str) or any(
	        placeholder not in user_template for placeholder in ("{text}", "{language}")
	    ):
	        raise ValueError("prompt must contain `{text}` and `{language}`")

	    if prompt.keys() - {"user", "system"}:
	        raise ValueError("prompt can only contain the keys of `user` and `system`")

	    print("prompt config:", prompt)
	    return prompt


	def _build_parser():
	    """Create the argument parser that declares every command-line option."""
	    translate_model_list = list(MODEL_DICT.keys())
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--book_name",
	        dest="book_name",
	        type=str,
	        help="path of the epub file to be translated",
	    )
	    parser.add_argument(
	        "--book_from",
	        dest="book_from",
	        type=str,
	        choices=["kobo"],  # support kindle later
	        metavar="E-READER",
	        help="e-reader type, available: {%(choices)s}",
	    )
	    parser.add_argument(
	        "--device_path",
	        dest="device_path",
	        type=str,
	        help="Path of e-reader device",
	    )
	    ########## KEYS ##########
	    parser.add_argument(
	        "--openai_key",
	        dest="openai_key",
	        type=str,
	        default="",
	        help="OpenAI api key,if you have more than one key, please use comma"
	        " to split them to go beyond the rate limits",
	    )
	    parser.add_argument(
	        "--caiyun_key",
	        dest="caiyun_key",
	        type=str,
	        help="you can apply caiyun key from here (https://dashboard.caiyunapp.com/user/sign_in/)",
	    )
	    parser.add_argument(
	        "--deepl_key",
	        dest="deepl_key",
	        type=str,
	        help="you can apply deepl key from here (https://rapidapi.com/splintPRO/api/dpl-translator",
	    )
	    parser.add_argument(
	        "--claude_key",
	        dest="claude_key",
	        type=str,
	        help="you can find claude key from here (https://console.anthropic.com/account/keys)",
	    )

	    parser.add_argument(
	        "--custom_api",
	        dest="custom_api",
	        type=str,
	        help="you should build your own translation api",
	    )

	    # for Google Gemini
	    parser.add_argument(
	        "--gemini_key",
	        dest="gemini_key",
	        type=str,
	        help="You can get Gemini Key from https://makersuite.google.com/app/apikey",
	    )

	    parser.add_argument(
	        "--test",
	        dest="test",
	        action="store_true",
	        help="only the first 10 paragraphs will be translated, for testing",
	    )
	    parser.add_argument(
	        "--test_num",
	        dest="test_num",
	        type=int,
	        default=10,
	        help="how many paragraphs will be translated for testing",
	    )
	    parser.add_argument(
	        "-m",
	        "--model",
	        dest="model",
	        type=str,
	        default="chatgptapi",
	        choices=translate_model_list,  # support DeepL later
	        metavar="MODEL",
	        help="model to use, available: {%(choices)s}",
	    )
	    parser.add_argument(
	        "--language",
	        type=str,
	        choices=sorted(LANGUAGES.keys())
	        + sorted([k.title() for k in TO_LANGUAGE_CODE]),
	        default="zh-hans",
	        metavar="LANGUAGE",
	        help="language to translate to, available: {%(choices)s}",
	    )
	    parser.add_argument(
	        "--resume",
	        dest="resume",
	        action="store_true",
	        help="if program stop unexpected you can use this to resume",
	    )
	    parser.add_argument(
	        "-p",
	        "--proxy",
	        dest="proxy",
	        type=str,
	        default="",
	        help="use proxy like http://127.0.0.1:7890",
	    )
	    parser.add_argument(
	        "--deployment_id",
	        dest="deployment_id",
	        type=str,
	        help="the deployment name you chose when you deployed the model",
	    )
	    # args to change api_base
	    parser.add_argument(
	        "--api_base",
	        metavar="API_BASE_URL",
	        dest="api_base",
	        type=str,
	        help="specify base url other than the OpenAI's official API address",
	    )
	    parser.add_argument(
	        "--exclude_filelist",
	        dest="exclude_filelist",
	        type=str,
	        default="",
	        help="if you have more than one file to exclude, please use comma to split them, example: --exclude_filelist 'nav.xhtml,cover.xhtml'",
	    )
	    parser.add_argument(
	        "--only_filelist",
	        dest="only_filelist",
	        type=str,
	        default="",
	        help="if you only have a few files with translations, please use comma to split them, example: --only_filelist 'nav.xhtml,cover.xhtml'",
	    )
	    parser.add_argument(
	        "--translate-tags",
	        dest="translate_tags",
	        type=str,
	        default="p,h1,h2,h3,h4,li,div",
	        help="example --translate-tags p,blockquote",
	    )
	    parser.add_argument(
	        "--exclude_translate-tags",
	        dest="exclude_translate_tags",
	        type=str,
	        default="sup",
	        help="example --exclude_translate-tags table,sup",
	    )
	    parser.add_argument(
	        "--allow_navigable_strings",
	        dest="allow_navigable_strings",
	        action="store_true",
	        default=False,
	        help="allow NavigableStrings to be translated",
	    )
	    parser.add_argument(
	        "--prompt",
	        dest="prompt_arg",
	        type=str,
	        metavar="PROMPT_ARG",
	        help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
	    )
	    parser.add_argument(
	        "--accumulated_num",
	        dest="accumulated_num",
	        type=int,
	        default=1,
	        help="""Wait for how many tokens have been accumulated before starting the translation.
	gpt3.5 limits the total_token to 4090.
	For example, if you use --accumulated_num 1600, maybe openai will output 2200 tokens
	and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000,
	So you are close to reaching the limit. You have to choose your own value, there is no way to know if the limit is reached before sending
	""",
	    )
	    parser.add_argument(
	        "--translation_style",
	        dest="translation_style",
	        type=str,
	        help="""ex: --translation_style "color: #808080; font-style: italic;" """,
	    )
	    parser.add_argument(
	        "--batch_size",
	        dest="batch_size",
	        type=int,
	        help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
	    )
	    parser.add_argument(
	        "--checkpoint_interval",
	        dest="checkpoint_interval",
	        type=int,
	        default=50,
	        help="save EPUB resume checkpoint after this many processed units",
	    )
	    parser.add_argument(
	        "--skip_backmatter_after_percent",
	        dest="skip_backmatter_after_percent",
	        type=int,
	        default=50,
	        help="after this percent of EPUB progress, backmatter chapters can be skipped",
	    )
	    parser.add_argument(
	        "--skip_backmatter_titles",
	        dest="skip_backmatter_titles",
	        type=str,
	        default=",".join(DEFAULT_BACKMATTER_TITLES),
	        help="comma-separated EPUB backmatter chapter titles to stop translating after the threshold",
	    )
	    parser.add_argument(
	        "--review_mode",
	        dest="review_mode",
	        type=str,
	        default="suspicious_only",
	        choices=["always", "suspicious_only", "never"],
	        help="review strategy for gemini translator",
	    )
	    parser.add_argument(
	        "--review_min_chinese_ratio",
	        dest="review_min_chinese_ratio",
	        type=float,
	        default=0.2,
	        help="minimum chinese character ratio before gemini review is skipped",
	    )
	    parser.add_argument(
	        "--review_length_ratio_min",
	        dest="review_length_ratio_min",
	        type=float,
	        default=0.35,
	        help="minimum translated/source length ratio before gemini review is triggered",
	    )
	    parser.add_argument(
	        "--review_length_ratio_max",
	        dest="review_length_ratio_max",
	        type=float,
	        default=2.5,
	        help="maximum translated/source length ratio before gemini review is triggered",
	    )
	    parser.add_argument(
	        "--retranslate",
	        dest="retranslate",
	        nargs=4,
	        type=str,
	        help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
	Retranslate from start_str to end_str's tag:
	python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
	Retranslate start_str's tag:
	python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
	""",
	    )
	    parser.add_argument(
	        "--single_translate",
	        action="store_true",
	        help="output translated book, no bilingual",
	    )
	    parser.add_argument(
	        "--use_context",
	        dest="context_flag",
	        action="store_true",
	        help="adds an additional paragraph for global, updating historical context of the story to the model's input, improving the narrative consistency for the AI model (this uses ~200 more tokens each time)",
	    )
	    parser.add_argument(
	        "--temperature",
	        type=float,
	        default=1.0,
	        help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
	    )
	    return parser


	def _resolve_api_key(options):
	    """Pick the API key for ``options.model`` from the CLI flags or the environment.

	    Raises:
	        Exception: the selected model is chatgptapi/gpt4, caiyun, deepl,
	            claude or customapi and no key was found.
	    """
	    model = options.model
	    if model in ["chatgptapi", "gpt4"]:
	        api_key = (
	            options.openai_key
	            # XXX: for backward compatibility, deprecate soon
	            or env.get("OPENAI_API_KEY")
	            # suggest adding `BBM_` prefix for all the bilingual_book_maker ENVs.
	            or env.get("BBM_OPENAI_API_KEY")
	        )
	        if not api_key:
	            raise Exception(
	                "OpenAI API key not provided, please google how to obtain it",
	            )
	        return api_key
	    if model == "caiyun":
	        api_key = options.caiyun_key or env.get("BBM_CAIYUN_API_KEY")
	        if not api_key:
	            raise Exception("Please provide caiyun key")
	        return api_key
	    if model == "deepl":
	        api_key = options.deepl_key or env.get("BBM_DEEPL_API_KEY")
	        if not api_key:
	            raise Exception("Please provide deepl key")
	        return api_key
	    if model == "claude":
	        api_key = options.claude_key or env.get("BBM_CLAUDE_API_KEY")
	        if not api_key:
	            raise Exception("Please provide claude key")
	        return api_key
	    if model == "customapi":
	        api_key = options.custom_api or env.get("BBM_CUSTOM_API")
	        if not api_key:
	            raise Exception("Please provide custom translate api")
	        return api_key
	    if model == "gemini":
	        # NOTE(review): unlike the other models, a missing gemini key is not
	        # rejected here, so this may return None -- confirm the translator copes.
	        return options.gemini_key or env.get("BBM_GOOGLE_GEMINI_KEY")
	    return ""


	def main():
	    """Command-line entry point: parse options, build the book loader, translate.

	    Steps, in order: parse the arguments, check the input book, export the
	    proxy to the environment, resolve the translator class and its API key,
	    optionally fetch the book from a kobo device, pick the loader from the
	    file extension, copy the optional settings onto the loader and finally
	    call ``make_bilingual_book()``.

	    Raises:
	        SystemExit: ``--book_name`` is missing (and the book does not come
	            from a kobo device) or does not point to an existing file.
	        ValueError: ``--deployment_id`` is combined with an unsupported model
	            or given without ``--api_base``; also propagated from
	            ``parse_prompt_arg`` for an invalid ``--prompt``.
	        Exception: a required API key or the kobo device path is missing, or
	            the book's file extension has no loader.
	    """
	    options = _build_parser().parse_args()

	    if options.book_name is None:
	        # os.path.isfile(None) raises TypeError, so handle the missing option
	        # explicitly. With --book_from kobo the path is filled in further down.
	        if options.book_from != "kobo":
	            print("Error: please provide the path of your book using --book_name <path>")
	            raise SystemExit(1)
	    elif not os.path.isfile(options.book_name):
	        print(f"Error: {options.book_name} does not exist.")
	        raise SystemExit(1)

	    proxy = options.proxy
	    if proxy != "":
	        os.environ["http_proxy"] = proxy
	        os.environ["https_proxy"] = proxy

	    translate_model = MODEL_DICT.get(options.model)
	    # argparse `choices` already restricts --model to MODEL_DICT keys
	    assert translate_model is not None, "unsupported model"
	    api_key = _resolve_api_key(options)

	    if options.book_from == "kobo":
	        from book_maker import obok

	        device_path = options.device_path
	        if device_path is None:
	            raise Exception(
	                "Device path is not given, please specify the path by --device_path <DEVICE_PATH>",
	            )
	        options.book_name = obok.cli_main(device_path)

	    book_type = options.book_name.split(".")[-1]
	    support_type_list = list(BOOK_LOADER_DICT.keys())
	    if book_type not in support_type_list:
	        raise Exception(
	            f"now only support files of these formats: {','.join(support_type_list)}",
	        )

	    book_loader = BOOK_LOADER_DICT.get(book_type)
	    assert book_loader is not None, "unsupported loader"
	    language = options.language
	    if options.language in LANGUAGES:
	        # use the value for prompt
	        language = LANGUAGES.get(language, language)

	    # change api_base for issue #42
	    model_api_base = options.api_base

	    e = book_loader(
	        options.book_name,
	        translate_model,
	        api_key,
	        options.resume,
	        language=language,
	        model_api_base=model_api_base,
	        is_test=options.test,
	        test_num=options.test_num,
	        prompt_config=parse_prompt_arg(options.prompt_arg),
	        single_translate=options.single_translate,
	        context_flag=options.context_flag,
	        temperature=options.temperature,
	    )
	    # other options
	    if options.allow_navigable_strings:
	        e.allow_navigable_strings = True
	    if options.translate_tags:
	        e.translate_tags = options.translate_tags
	    if options.exclude_translate_tags:
	        e.exclude_translate_tags = options.exclude_translate_tags
	    if options.exclude_filelist:
	        e.exclude_filelist = options.exclude_filelist
	    if options.only_filelist:
	        e.only_filelist = options.only_filelist
	    if options.accumulated_num > 1:
	        e.accumulated_num = options.accumulated_num
	    if options.translation_style:
	        e.translation_style = options.translation_style
	    if options.batch_size:
	        e.batch_size = options.batch_size
	    if options.checkpoint_interval is not None:
	        e.checkpoint_interval = options.checkpoint_interval
	    # negative percentages are clamped to 0
	    e.skip_backmatter_after_percent = max(0, options.skip_backmatter_after_percent)
	    e.skip_backmatter_titles = split_backmatter_titles(options.skip_backmatter_titles)
	    if options.model == "gemini":
	        e.translate_model.review_mode = options.review_mode
	        e.translate_model.review_min_chinese_ratio = options.review_min_chinese_ratio
	        e.translate_model.review_length_ratio_min = options.review_length_ratio_min
	        e.translate_model.review_length_ratio_max = options.review_length_ratio_max
	    if options.retranslate:
	        e.retranslate = options.retranslate
	    if options.deployment_id:
	        # only work for ChatGPT api for now
	        # later maybe support others
	        # raised explicitly (not asserted) so the check survives `python -O`
	        if options.model not in ["chatgptapi", "gpt4"]:
	            raise ValueError("only support chatgptapi for deployment_id")
	        if not options.api_base:
	            raise ValueError("`api_base` must be provided when using `deployment_id`")
	        e.translate_model.set_deployment_id(options.deployment_id)
	    # TODO refactor, quick fix for gpt4 model
	    if options.model == "gpt4":
	        e.translate_model.set_gpt4_models("gpt4")

	    e.make_bilingual_book()


	# Run the command-line interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()