| | |
| | """A command line tool for extracting text and images from PDF and |
| | output it to plain text, html, xml or tags. |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import logging |
| | import sys |
| | from string import Template |
| | from typing import List, Optional |
| |
|
| | from pdf2zh import __version__, log |
| | from pdf2zh.high_level import translate, download_remote_fonts |
| | from pdf2zh.doclayout import OnnxModel, ModelInstance |
| | import os |
| |
|
| | from pdf2zh.config import ConfigManager |
| | from babeldoc.translation_config import TranslationConfig as YadtConfig |
| | from babeldoc.high_level import async_translate as yadt_translate |
| | from babeldoc.high_level import init as yadt_init |
| | from babeldoc.main import create_progress_handler |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the command line interface.

    The positional ``files`` argument plus ``--version`` and ``--debug`` are
    registered directly on the parser; every other option is registered, in
    declaration order, on an argument group titled "Parser".

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(description=__doc__, add_help=True)

    # Arguments that live directly on the parser.
    cli.add_argument(
        "files", type=str, default=None, nargs="*", help="One or more paths to PDF files."
    )
    cli.add_argument(
        "--version", "-v", action="version", version=f"pdf2zh v{__version__}"
    )
    cli.add_argument(
        "--debug", "-d", default=False, action="store_true", help="Use debug logging level."
    )

    group = cli.add_argument_group("Parser", description="Used during PDF parsing")

    # (flags, keyword arguments) for each grouped option. The order of this
    # table is the order in which the options appear in ``--help``.
    grouped_options = (
        (("--pages", "-p"), {"type": str, "help": "The list of page numbers to parse."}),
        (
            ("--vfont", "-f"),
            {"type": str, "default": "", "help": "The regex to math font name of formula."},
        ),
        (
            ("--vchar", "-c"),
            {"type": str, "default": "", "help": "The regex to math character of formula."},
        ),
        (
            ("--lang-in", "-li"),
            {"type": str, "default": "en", "help": "The code of source language."},
        ),
        (
            ("--lang-out", "-lo"),
            {"type": str, "default": "zh", "help": "The code of target language."},
        ),
        (
            ("--service", "-s"),
            {"type": str, "default": "google", "help": "The service to use for translation."},
        ),
        (
            ("--output", "-o"),
            {"type": str, "default": "", "help": "Output directory for files."},
        ),
        (
            ("--thread", "-t"),
            {
                "type": int,
                "default": 4,
                "help": "The number of threads to execute translation.",
            },
        ),
        (("--interactive", "-i"), {"action": "store_true", "help": "Interact with GUI."}),
        (("--share",), {"action": "store_true", "help": "Enable Gradio Share"}),
        (("--flask",), {"action": "store_true", "help": "flask"}),
        (("--celery",), {"action": "store_true", "help": "celery"}),
        (
            ("--authorized",),
            {"type": str, "nargs": "+", "help": "user name and password."},
        ),
        (("--prompt",), {"type": str, "help": "user custom prompt."}),
        (
            ("--compatible", "-cp"),
            {
                "action": "store_true",
                "help": "Convert the PDF file into PDF/A format to improve compatibility.",
            },
        ),
        (("--onnx",), {"type": str, "help": "custom onnx model path."}),
        (("--serverport",), {"type": int, "help": "custom WebUI port."}),
        (("--dir",), {"action": "store_true", "help": "translate directory."}),
        (("--config",), {"type": str, "help": "config file."}),
        (
            ("--babeldoc",),
            {
                "default": False,
                "action": "store_true",
                "help": "Use experimental backend babeldoc.",
            },
        ),
        (
            ("--skip-subset-fonts",),
            {
                "action": "store_true",
                "help": "Skip font subsetting. "
                "This option can improve compatibility "
                "but will increase the size of the output file.",
            },
        ),
        (
            ("--ignore-cache",),
            {"action": "store_true", "help": "Ignore cache and force retranslation."},
        ),
    )
    for flags, settings in grouped_options:
        group.add_argument(*flags, **settings)

    return cli
| |
|
| |
|
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse command line arguments and expand the ``--pages`` option.

    When ``--pages`` is given, its comma-separated value (single 1-based
    numbers such as ``3`` or inclusive ranges such as ``2-5``) is expanded
    into a list of 0-based page indices stored in ``pages``; the untouched
    string is kept in ``raw_pages``. Without ``--pages`` the namespace is
    returned as produced by the parser and ``raw_pages`` is not set.

    Args:
        args: Argument strings to parse, forwarded to
            ``ArgumentParser.parse_args``.

    Returns:
        The populated namespace.
    """
    namespace = create_parser().parse_args(args=args)

    page_spec = namespace.pages
    if not page_spec:
        return namespace

    zero_based: List[int] = []
    for token in page_spec.split(","):
        if "-" not in token:
            zero_based.append(int(token) - 1)
            continue
        first, last = token.split("-")
        # "first-last" is inclusive and 1-based; range() end is exclusive.
        zero_based += range(int(first) - 1, int(last))

    # The namespace is only touched once the whole spec parsed cleanly.
    namespace.raw_pages = page_spec
    namespace.pages = zero_based
    return namespace
| |
|
| |
|
def find_all_files_in_directory(directory_path):
    """
    Collect the paths of all PDF files found beneath a directory.

    The directory tree is walked recursively and the ``.pdf`` extension is
    matched case-insensitively.

    :param directory_path: str, the root directory of the search
    :return: list of PDF file paths, each joined onto the folder it was found in
    :raises ValueError: if ``directory_path`` is not an existing directory
    """
    if not os.path.isdir(directory_path):
        raise ValueError(f"The provided path '{directory_path}' is not a directory.")

    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(directory_path)
        for entry in entries
        if entry.lower().endswith(".pdf")
    ]
| |
|
| |
|
def main(args: Optional[List[str]] = None) -> int:
    """Run the command line interface.

    Configures logging, parses the arguments, applies the optional custom
    config file and ONNX layout model, and then dispatches to exactly one of:
    the GUI (``--interactive``), the Flask backend (``--flask``), the Celery
    worker (``--celery``), the babeldoc backend (``--babeldoc``) or the
    default ``translate`` pipeline.

    Args:
        args: Argument strings to parse, forwarded to ``parse_args``.

    Returns:
        0 on completion, or the return value of ``yadt_main`` when
        ``--babeldoc`` is used.

    Raises:
        ValueError: If the ``--prompt`` file cannot be read, or if ``--dir``
            is given without a directory path.
    """
    from rich.logging import RichHandler

    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    # Silence the HTTP client libraries: raise their level and stop their
    # records from propagating to the root handler.
    for noisy_name in ("httpx", "openai", "httpcore", "http11"):
        noisy_logger = logging.getLogger(noisy_name)
        noisy_logger.setLevel("CRITICAL")
        noisy_logger.propagate = False

    parsed_args = parse_args(args)

    if parsed_args.config:
        ConfigManager.custome_config(parsed_args.config)

    if parsed_args.debug:
        log.setLevel(logging.DEBUG)

    if parsed_args.onnx:
        ModelInstance.value = OnnxModel(parsed_args.onnx)
    else:
        ModelInstance.value = OnnxModel.load_available()

    if parsed_args.interactive:
        from pdf2zh.gui import setup_gui

        if parsed_args.serverport:
            setup_gui(
                parsed_args.share, parsed_args.authorized, int(parsed_args.serverport)
            )
        else:
            setup_gui(parsed_args.share, parsed_args.authorized)
        return 0

    if parsed_args.flask:
        from pdf2zh.backend import flask_app

        flask_app.run(port=11008)
        return 0

    if parsed_args.celery:
        from pdf2zh.backend import celery_app

        celery_app.start(argv=sys.argv[2:])
        return 0

    if parsed_args.prompt:
        # ``--prompt`` is a path; replace it with the Template built from the
        # file's content.
        try:
            with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                content = file.read()
            parsed_args.prompt = Template(content)
        except Exception as err:
            # Keep the original cause so the user can see why reading failed.
            raise ValueError("prompt error.") from err

    print(parsed_args)
    if parsed_args.babeldoc:
        return yadt_main(parsed_args)

    if parsed_args.dir:
        # With --dir the first positional argument is the directory to scan.
        if not parsed_args.files:
            raise ValueError(
                "--dir requires the directory path as the first positional argument."
            )
        parsed_args.files = find_all_files_in_directory(parsed_args.files[0])

    translate(model=ModelInstance.value, **vars(parsed_args))
    return 0
| |
|
| |
|
def yadt_main(parsed_args) -> int:
    """Translate the requested PDF files with the babeldoc backend.

    Args:
        parsed_args: Parsed command line namespace. Reads ``dir``, ``files``,
            ``lang_in``, ``lang_out``, ``ignore_cache``, ``output``,
            ``service`` (``name`` or ``name:model``), ``prompt`` (a file path
            or an already built ``string.Template``), ``debug``, ``thread``
            and, when present, ``raw_pages``.

    Returns:
        0 once every input file has been processed.

    Raises:
        ValueError: If ``--dir`` is given without a directory path, if the
            prompt file cannot be read, or if the service name matches no
            known translator.
    """
    if parsed_args.dir:
        # With --dir the first positional argument is the directory to scan.
        if not parsed_args.files:
            raise ValueError(
                "--dir requires the directory path as the first positional argument."
            )
        untranlate_file = find_all_files_in_directory(parsed_args.files[0])
    else:
        untranlate_file = parsed_args.files
    lang_in = parsed_args.lang_in
    lang_out = parsed_args.lang_out
    ignore_cache = parsed_args.ignore_cache
    outputdir = parsed_args.output if parsed_args.output else None

    # babeldoc is initialised before anything else is requested from it.
    yadt_init()
    font_path = download_remote_fonts(lang_out.lower())

    # "--service name:model" -> ("name", "model"); the model part is optional.
    service_name, _, model_part = parsed_args.service.partition(":")
    service_model = model_part if ":" in parsed_args.service else None

    envs = {}
    prompt = []

    if parsed_args.prompt:
        if isinstance(parsed_args.prompt, Template):
            # main() has already loaded the prompt file into a Template;
            # trying to open() it again would always fail.
            prompt = parsed_args.prompt
        else:
            try:
                with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                    content = file.read()
                prompt = Template(content)
            except Exception as err:
                raise ValueError("prompt error.") from err

    from pdf2zh.translator import (
        AzureOpenAITranslator,
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        XinferenceTranslator,
        ArgosTranslator,
        GrokTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
        QwenMtTranslator,
    )

    # Candidates are checked in this order; the first whose ``name`` equals
    # the requested service wins.
    translator_classes = (
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        XinferenceTranslator,
        AzureOpenAITranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        ArgosTranslator,
        GrokTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
        QwenMtTranslator,
    )
    translator_cls = next(
        (cls for cls in translator_classes if cls.name == service_name), None
    )
    if translator_cls is None:
        raise ValueError("Unsupported translation service")
    translator = translator_cls(
        lang_in,
        lang_out,
        service_model,
        envs=envs,
        prompt=prompt,
        ignore_cache=ignore_cache,
    )

    # ``raw_pages`` is the untouched --pages string (e.g. "1-3,5"). Joining
    # its characters with commas would corrupt it ("10" -> "1,0"), so a
    # string is forwarded as-is; any other iterable is joined item by item.
    raw_pages = getattr(parsed_args, "raw_pages", None)
    if not raw_pages:
        pages = ""
    elif isinstance(raw_pages, str):
        pages = raw_pages
    else:
        pages = ",".join(str(x) for x in raw_pages)

    import asyncio

    async def yadt_translate_coro(yadt_config):
        """Drive one babeldoc translation and log its result on completion."""
        progress_context, progress_handler = create_progress_handler(yadt_config)
        with progress_context:
            async for event in yadt_translate(yadt_config):
                progress_handler(event)
                if yadt_config.debug:
                    logger.debug(event)
                if event["type"] == "finish":
                    result = event["translate_result"]
                    logger.info("Translation Result:")
                    logger.info(f" Original PDF: {result.original_pdf_path}")
                    logger.info(f" Time Cost: {result.total_seconds:.2f}s")
                    logger.info(f" Mono PDF: {result.mono_pdf_path or 'None'}")
                    logger.info(f" Dual PDF: {result.dual_pdf_path or 'None'}")
                    break

    for file in untranlate_file:
        # Drop quote characters surrounding the path.
        file = file.strip("\"'")
        yadt_config = YadtConfig(
            input_file=file,
            font=font_path,
            pages=pages,
            output_dir=outputdir,
            doc_layout_model=None,
            translator=translator,
            debug=parsed_args.debug,
            lang_in=lang_in,
            lang_out=lang_out,
            no_dual=False,
            no_mono=False,
            qps=parsed_args.thread,
        )
        asyncio.run(yadt_translate_coro(yadt_config))
    return 0
| |
|
| |
|
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
| |
|