Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu commited on May 4

Commit

2de84f5

verified ·

1 Parent(s): 75e2413

Upload bilingual_book_maker/book_maker/cli.py with huggingface_hub

Browse files

Files changed (1) hide show

bilingual_book_maker/book_maker/cli.py +462 -0

bilingual_book_maker/book_maker/cli.py ADDED Viewed

	@@ -0,0 +1,462 @@

+import argparse
+import json
+import os
+from os import environ as env
+from book_maker.backmatter import DEFAULT_BACKMATTER_TITLES, split_backmatter_titles
+from book_maker.loader import BOOK_LOADER_DICT
+from book_maker.translator import MODEL_DICT
+from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE
+def parse_prompt_arg(prompt_arg):
+    prompt = None
+    if prompt_arg is None:
+        return prompt
+    if not any(prompt_arg.endswith(ext) for ext in [".json", ".txt"]):
+        try:
+            # user can define prompt by passing a json string
+            # eg: --prompt '{"system": "You are a professional translator who translates computer technology books", "user": "Translate \`{text}\` to {language}"}'
+            prompt = json.loads(prompt_arg)
+        except json.JSONDecodeError:
+            # if not a json string, treat it as a template string
+            prompt = {"user": prompt_arg}
+    elif os.path.exists(prompt_arg):
+        if prompt_arg.endswith(".txt"):
+            # if it's a txt file, treat it as a template string
+            with open(prompt_arg, encoding="utf-8") as f:
+                prompt = {"user": f.read()}
+        elif prompt_arg.endswith(".json"):
+            # if it's a json file, treat it as a json object
+            # eg: --prompt prompt_template_sample.json
+            with open(prompt_arg, encoding="utf-8") as f:
+                prompt = json.load(f)
+    else:
+        raise FileNotFoundError(f"{prompt_arg} not found")
+    if prompt is None or any(c not in prompt["user"] for c in ["{text}", "{language}"]):
+        raise ValueError("prompt must contain `{text}` and `{language}`")
+    if "user" not in prompt:
+        raise ValueError("prompt must contain the key of `user`")
+    if (prompt.keys() - {"user", "system"}) != set():
+        raise ValueError("prompt can only contain the keys of `user` and `system`")
+    print("prompt config:", prompt)
+    return prompt
+def main():
+    translate_model_list = list(MODEL_DICT.keys())
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--book_name",
+        dest="book_name",
+        type=str,
+        help="path of the epub file to be translated",
+    )
+    parser.add_argument(
+        "--book_from",
+        dest="book_from",
+        type=str,
+        choices=["kobo"],  # support kindle later
+        metavar="E-READER",
+        help="e-reader type, available: {%(choices)s}",
+    )
+    parser.add_argument(
+        "--device_path",
+        dest="device_path",
+        type=str,
+        help="Path of e-reader device",
+    )
+    ########## KEYS ##########
+    parser.add_argument(
+        "--openai_key",
+        dest="openai_key",
+        type=str,
+        default="",
+        help="OpenAI api key,if you have more than one key, please use comma"
+        " to split them to go beyond the rate limits",
+    )
+    parser.add_argument(
+        "--caiyun_key",
+        dest="caiyun_key",
+        type=str,
+        help="you can apply caiyun key from here (https://dashboard.caiyunapp.com/user/sign_in/)",
+    )
+    parser.add_argument(
+        "--deepl_key",
+        dest="deepl_key",
+        type=str,
+        help="you can apply deepl key from here (https://rapidapi.com/splintPRO/api/dpl-translator",
+    )
+    parser.add_argument(
+        "--claude_key",
+        dest="claude_key",
+        type=str,
+        help="you can find claude key from here (https://console.anthropic.com/account/keys)",
+    )
+    parser.add_argument(
+        "--custom_api",
+        dest="custom_api",
+        type=str,
+        help="you should build your own translation api",
+    )
+    # for Google Gemini
+    parser.add_argument(
+        "--gemini_key",
+        dest="gemini_key",
+        type=str,
+        help="You can get Gemini Key from  https://makersuite.google.com/app/apikey",
+    )
+    parser.add_argument(
+        "--test",
+        dest="test",
+        action="store_true",
+        help="only the first 10 paragraphs will be translated, for testing",
+    )
+    parser.add_argument(
+        "--test_num",
+        dest="test_num",
+        type=int,
+        default=10,
+        help="how many paragraphs will be translated for testing",
+    )
+    parser.add_argument(
+        "-m",
+        "--model",
+        dest="model",
+        type=str,
+        default="chatgptapi",
+        choices=translate_model_list,  # support DeepL later
+        metavar="MODEL",
+        help="model to use, available: {%(choices)s}",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        choices=sorted(LANGUAGES.keys())
+        + sorted([k.title() for k in TO_LANGUAGE_CODE]),
+        default="zh-hans",
+        metavar="LANGUAGE",
+        help="language to translate to, available: {%(choices)s}",
+    )
+    parser.add_argument(
+        "--resume",
+        dest="resume",
+        action="store_true",
+        help="if program stop unexpected you can use this to resume",
+    )
+    parser.add_argument(
+        "-p",
+        "--proxy",
+        dest="proxy",
+        type=str,
+        default="",
+        help="use proxy like http://127.0.0.1:7890",
+    )
+    parser.add_argument(
+        "--deployment_id",
+        dest="deployment_id",
+        type=str,
+        help="the deployment name you chose when you deployed the model",
+    )
+    # args to change api_base
+    parser.add_argument(
+        "--api_base",
+        metavar="API_BASE_URL",
+        dest="api_base",
+        type=str,
+        help="specify base url other than the OpenAI's official API address",
+    )
+    parser.add_argument(
+        "--exclude_filelist",
+        dest="exclude_filelist",
+        type=str,
+        default="",
+        help="if you have more than one file to exclude, please use comma to split them, example: --exclude_filelist 'nav.xhtml,cover.xhtml'",
+    )
+    parser.add_argument(
+        "--only_filelist",
+        dest="only_filelist",
+        type=str,
+        default="",
+        help="if you only have a few files with translations, please use comma to split them, example: --only_filelist 'nav.xhtml,cover.xhtml'",
+    )
+    parser.add_argument(
+        "--translate-tags",
+        dest="translate_tags",
+        type=str,
+        default="p,h1,h2,h3,h4,li,div",
+        help="example --translate-tags p,blockquote",
+    )
+    parser.add_argument(
+        "--exclude_translate-tags",
+        dest="exclude_translate_tags",
+        type=str,
+        default="sup",
+        help="example --exclude_translate-tags table,sup",
+    )
+    parser.add_argument(
+        "--allow_navigable_strings",
+        dest="allow_navigable_strings",
+        action="store_true",
+        default=False,
+        help="allow NavigableStrings to be translated",
+    )
+    parser.add_argument(
+        "--prompt",
+        dest="prompt_arg",
+        type=str,
+        metavar="PROMPT_ARG",
+        help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
+    )
+    parser.add_argument(
+        "--accumulated_num",
+        dest="accumulated_num",
+        type=int,
+        default=1,
+        help="""Wait for how many tokens have been accumulated before starting the translation.
+gpt3.5 limits the total_token to 4090.
+For example, if you use --accumulated_num 1600, maybe openai will output 2200 tokens
+and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000,
+So you are close to reaching the limit. You have to choose your own value, there is no way to know if the limit is reached before sending
+""",
+    )
+    parser.add_argument(
+        "--translation_style",
+        dest="translation_style",
+        type=str,
+        help="""ex: --translation_style "color: #808080; font-style: italic;" """,
+    )
+    parser.add_argument(
+        "--batch_size",
+        dest="batch_size",
+        type=int,
+        help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
+    )
+    parser.add_argument(
+        "--checkpoint_interval",
+        dest="checkpoint_interval",
+        type=int,
+        default=50,
+        help="save EPUB resume checkpoint after this many processed units",
+    )
+    parser.add_argument(
+        "--skip_backmatter_after_percent",
+        dest="skip_backmatter_after_percent",
+        type=int,
+        default=50,
+        help="after this percent of EPUB progress, backmatter chapters can be skipped",
+    )
+    parser.add_argument(
+        "--skip_backmatter_titles",
+        dest="skip_backmatter_titles",
+        type=str,
+        default=",".join(DEFAULT_BACKMATTER_TITLES),
+        help="comma-separated EPUB backmatter chapter titles to stop translating after the threshold",
+    )
+    parser.add_argument(
+        "--review_mode",
+        dest="review_mode",
+        type=str,
+        default="suspicious_only",
+        choices=["always", "suspicious_only", "never"],
+        help="review strategy for gemini translator",
+    )
+    parser.add_argument(
+        "--review_min_chinese_ratio",
+        dest="review_min_chinese_ratio",
+        type=float,
+        default=0.2,
+        help="minimum chinese character ratio before gemini review is skipped",
+    )
+    parser.add_argument(
+        "--review_length_ratio_min",
+        dest="review_length_ratio_min",
+        type=float,
+        default=0.35,
+        help="minimum translated/source length ratio before gemini review is triggered",
+    )
+    parser.add_argument(
+        "--review_length_ratio_max",
+        dest="review_length_ratio_max",
+        type=float,
+        default=2.5,
+        help="maximum translated/source length ratio before gemini review is triggered",
+    )
+    parser.add_argument(
+        "--retranslate",
+        dest="retranslate",
+        nargs=4,
+        type=str,
+        help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
+        Retranslate from start_str to end_str's tag:
+        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
+        Retranslate start_str's tag:
+        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
+""",
+    )
+    parser.add_argument(
+        "--single_translate",
+        action="store_true",
+        help="output translated book, no bilingual",
+    )
+    parser.add_argument(
+        "--use_context",
+        dest="context_flag",
+        action="store_true",
+        help="adds an additional paragraph for global, updating historical context of the story to the model's input, improving the narrative consistency for the AI model (this uses ~200 more tokens each time)",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=1.0,
+        help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
+    )
+    options = parser.parse_args()
+    if not os.path.isfile(options.book_name):
+        print(f"Error: {options.book_name} does not exist.")
+        exit(1)
+    PROXY = options.proxy
+    if PROXY != "":
+        os.environ["http_proxy"] = PROXY
+        os.environ["https_proxy"] = PROXY
+    translate_model = MODEL_DICT.get(options.model)
+    assert translate_model is not None, "unsupported model"
+    API_KEY = ""
+    if options.model in ["chatgptapi", "gpt4"]:
+        if OPENAI_API_KEY := (
+            options.openai_key
+            or env.get(
+                "OPENAI_API_KEY",
+            )  # XXX: for backward compatibility, deprecate soon
+            or env.get(
+                "BBM_OPENAI_API_KEY",
+            )  # suggest adding `BBM_` prefix for all the bilingual_book_maker ENVs.
+        ):
+            API_KEY = OPENAI_API_KEY
+            # patch
+        else:
+            raise Exception(
+                "OpenAI API key not provided, please google how to obtain it",
+            )
+    elif options.model == "caiyun":
+        API_KEY = options.caiyun_key or env.get("BBM_CAIYUN_API_KEY")
+        if not API_KEY:
+            raise Exception("Please provide caiyun key")
+    elif options.model == "deepl":
+        API_KEY = options.deepl_key or env.get("BBM_DEEPL_API_KEY")
+        if not API_KEY:
+            raise Exception("Please provide deepl key")
+    elif options.model == "claude":
+        API_KEY = options.claude_key or env.get("BBM_CLAUDE_API_KEY")
+        if not API_KEY:
+            raise Exception("Please provide claude key")
+    elif options.model == "customapi":
+        API_KEY = options.custom_api or env.get("BBM_CUSTOM_API")
+        if not API_KEY:
+            raise Exception("Please provide custom translate api")
+    elif options.model == "gemini":
+        API_KEY = options.gemini_key or env.get("BBM_GOOGLE_GEMINI_KEY")
+    else:
+        API_KEY = ""
+    if options.book_from == "kobo":
+        from book_maker import obok
+        device_path = options.device_path
+        if device_path is None:
+            raise Exception(
+                "Device path is not given, please specify the path by --device_path <DEVICE_PATH>",
+            )
+        options.book_name = obok.cli_main(device_path)
+    book_type = options.book_name.split(".")[-1]
+    support_type_list = list(BOOK_LOADER_DICT.keys())
+    if book_type not in support_type_list:
+        raise Exception(
+            f"now only support files of these formats: {','.join(support_type_list)}",
+        )
+    book_loader = BOOK_LOADER_DICT.get(book_type)
+    assert book_loader is not None, "unsupported loader"
+    language = options.language
+    if options.language in LANGUAGES:
+        # use the value for prompt
+        language = LANGUAGES.get(language, language)
+    # change api_base for issue #42
+    model_api_base = options.api_base
+    e = book_loader(
+        options.book_name,
+        translate_model,
+        API_KEY,
+        options.resume,
+        language=language,
+        model_api_base=model_api_base,
+        is_test=options.test,
+        test_num=options.test_num,
+        prompt_config=parse_prompt_arg(options.prompt_arg),
+        single_translate=options.single_translate,
+        context_flag=options.context_flag,
+        temperature=options.temperature,
+    )
+    # other options
+    if options.allow_navigable_strings:
+        e.allow_navigable_strings = True
+    if options.translate_tags:
+        e.translate_tags = options.translate_tags
+    if options.exclude_translate_tags:
+        e.exclude_translate_tags = options.exclude_translate_tags
+    if options.exclude_filelist:
+        e.exclude_filelist = options.exclude_filelist
+    if options.only_filelist:
+        e.only_filelist = options.only_filelist
+    if options.accumulated_num > 1:
+        e.accumulated_num = options.accumulated_num
+    if options.translation_style:
+        e.translation_style = options.translation_style
+    if options.batch_size:
+        e.batch_size = options.batch_size
+    if options.checkpoint_interval is not None:
+        e.checkpoint_interval = options.checkpoint_interval
+    e.skip_backmatter_after_percent = max(0, options.skip_backmatter_after_percent)
+    e.skip_backmatter_titles = split_backmatter_titles(options.skip_backmatter_titles)
+    if options.model == "gemini":
+        e.translate_model.review_mode = options.review_mode
+        e.translate_model.review_min_chinese_ratio = options.review_min_chinese_ratio
+        e.translate_model.review_length_ratio_min = options.review_length_ratio_min
+        e.translate_model.review_length_ratio_max = options.review_length_ratio_max
+    if options.retranslate:
+        e.retranslate = options.retranslate
+    if options.deployment_id:
+        # only work for ChatGPT api for now
+        # later maybe support others
+        assert options.model in [
+            "chatgptapi",
+            "gpt4",
+        ], "only support chatgptapi for deployment_id"
+        if not options.api_base:
+            raise ValueError("`api_base` must be provided when using `deployment_id`")
+        e.translate_model.set_deployment_id(options.deployment_id)
+    # TODO refactor, quick fix for gpt4 model
+    if options.model == "gpt4":
+        e.translate_model.set_gpt4_models("gpt4")
+    e.make_bilingual_book()
+if __name__ == "__main__":
+    main()