davideuler
model default to default_model for mlx/gguf models, and add new text as optional layer; allow --no-original option to remove original content in pdf, just save the translated content
4a4fa23
import argparse
import html
import os
import sys

import pymupdf
from deep_translator import (
    GoogleTranslator,
    ChatGptTranslator,
)
# Registry of translation backends, keyed by the name accepted on the
# command line (--translator) and by translate_pdf(translator_name=...).
TRANSLATORS = {
    "google": GoogleTranslator,
    "chatgpt": ChatGptTranslator,
}
def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Text",
                  translator_name: str = "google", text_color: str = "darkred",
                  keep_original: bool = True) -> None:
    """
    Translate a PDF file from source language to target language.

    Every text block of every page is translated and written back into the
    block's bounding box on an Optional Content (OCG) layer. The result is
    saved next to the input file as ``<input-without-extension>-<target_lang>.pdf``
    and the output path is printed.

    Args:
        input_file: Path to input PDF file
        source_lang: Source language code (e.g. 'en', 'fr')
        target_lang: Target language code (e.g. 'ko', 'ja')
        layer: Name of the OCG layer holding the translation (default: "Text")
        translator_name: Key into TRANSLATORS (default: "google")
        text_color: Color of translated text; unknown names fall back to
            "darkred" (default: "darkred")
        keep_original: If True, the white cover over the original text lives
            on the translation layer, so hiding that layer reveals the
            original. If False, the original text is covered unconditionally
            and a copy of it is placed on a hidden "Original" layer.

    Raises:
        ValueError: If translator_name is not a key of TRANSLATORS.
    """
    WHITE = pymupdf.pdfcolor["white"]

    # Supported text colors as RGB components in the 0..1 range
    COLOR_MAP = {
        "darkred": (0.8, 0, 0),
        "black": (0, 0, 0),
        "blue": (0, 0, 0.8),
        "darkgreen": (0, 0.5, 0),
        "purple": (0.5, 0, 0.5),
    }

    # Validate the translator before opening any file
    if translator_name not in TRANSLATORS:
        raise ValueError(f"Unsupported translator: {translator_name}. Available translators: {', '.join(TRANSLATORS.keys())}")
    translator = TRANSLATORS[translator_name](source=source_lang, target=target_lang)

    # The CSS is identical for every block, so build it once up front.
    red, green, blue = (
        int(component * 255)
        for component in COLOR_MAP.get(text_color.lower(), COLOR_MAP["darkred"])
    )
    original_css = "* {font-family: sans-serif;}"
    translated_css = f"* {{font-family: sans-serif; color: rgb({red}, {green}, {blue});}}"

    # This flag ensures that text will be dehyphenated after extraction.
    textflags = pymupdf.TEXT_DEHYPHENATE

    # splitext only strips a real file extension; a dot in a directory name
    # (e.g. "docs.v1/report") must not truncate the output path.
    output_file = os.path.splitext(input_file)[0] + f'-{target_lang}.pdf'

    # The context manager closes the document even if translation fails midway.
    with pymupdf.open(input_file) as doc:
        # Optional Content layer that carries the translation
        ocg_trans = doc.add_ocg(layer, on=True)
        # Hidden layer for the original text, only needed when it is being covered
        ocg_orig = None if keep_original else doc.add_ocg("Original", on=False)

        for page in doc:
            # Extract text grouped like lines in a paragraph; each block
            # carries its bounding box in the first four items.
            for block in page.get_text("blocks", flags=textflags):
                bbox = block[:4]  # area containing the text
                text = block[4]   # the text of this block

                # Nothing to translate: leave the page area untouched.
                if not text.strip():
                    continue

                translated = translator.translate(text)
                # An empty/None translation would only blank out the original.
                if not translated:
                    continue

                if keep_original:
                    # Cover the original text only while the translation layer is shown
                    page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg_trans)
                else:
                    # Keep a copy of the original on the hidden layer ...
                    page.insert_htmlbox(
                        bbox,
                        # insert_htmlbox parses its input as HTML, so plain
                        # text must be escaped or '<' and '&' get swallowed.
                        html.escape(text, quote=False),
                        css=original_css,
                        oc=ocg_orig,
                    )
                    # ... then blank the area in the base content.
                    page.draw_rect(bbox, color=None, fill=WHITE)

                # Write the translated text in the specified color
                page.insert_htmlbox(
                    bbox,
                    html.escape(translated, quote=False),
                    css=translated_css,
                    oc=ocg_trans,
                )

        doc.subset_fonts()
        doc.ez_save(output_file)

    print(f"Translated PDF saved as: {output_file}")
def main():
    """
    Command-line entry point: parse arguments and translate one PDF.

    Can be invoked like this:

    ```
    # Basic usage
    python translator_cli.py --source english --target zh-CN input.pdf

    # With custom color and hiding original text
    python translator_cli.py --source english --target zh-CN --color blue --no-original input.pdf

    # Using ChatGPT translator
    export OPENAI_API_KEY=sk-proj-xxxx
    export OPENAI_API_BASE=https://api.xxxx.com/v1
    export OPENAI_API_BASE=http://localhost:8080/v1 # for local llm api
    python translator_cli.py --source english --translator chatgpt --target zh-CN input.pdf

    # Do not keep the original text visible:
    python translator_cli.py --source english --translator chatgpt --target zh-CN --no-original input.pdf
    ```

    The translated content is an optional content layer in the new PDF file.
    The optional layer can be hidden in Acrobat PDF Reader and Foxit Reader.

    On any error the message is printed to stderr and the process exits
    with status 1.
    """
    parser = argparse.ArgumentParser(description='Translate PDF documents.')
    parser.add_argument('input_file', help='Input PDF file path')
    parser.add_argument('--source', '-s', default='en',
                        help='Source language code (default: en)')
    parser.add_argument('--target', '-t', default='zh-CN',
                        help='Target language code (default: zh-CN)')
    parser.add_argument('--layer', '-l', default='Text',
                        help='Name of the OCG layer (default: Text)')
    parser.add_argument('--translator', '-tr', default='google',
                        choices=list(TRANSLATORS.keys()),
                        help='Translator to use (default: google)')
    parser.add_argument('--color', '-c', default='darkred',
                        choices=['darkred', 'black', 'blue', 'darkgreen', 'purple'],
                        help='Color of translated text (default: darkred)')
    parser.add_argument('--no-original', action='store_true',
                        help='Do not keep original text in base layer (default: False)')
    args = parser.parse_args()

    try:
        translate_pdf(
            args.input_file,
            args.source,
            args.target,
            layer=args.layer,
            translator_name=args.translator,
            text_color=args.color,
            keep_original=not args.no_original,
        )
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure to the shell.
        # sys.exit is used because the bare exit() helper is injected by the
        # site module and is not guaranteed to exist.
        print(f"Error: {str(e)}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()