Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- pdf2zh/__init__.py +6 -0
- pdf2zh/cache.py +91 -0
- pdf2zh/converter.py +456 -0
- pdf2zh/doclayout.py +163 -0
- pdf2zh/gui.py +503 -0
- pdf2zh/high_level.py +99 -0
- pdf2zh/pdf2zh.py +325 -0
- pdf2zh/pdfinterp.py +360 -0
- pdf2zh/translator.py +347 -0
pdf2zh/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
log = logging.getLogger(__name__)
|
| 4 |
+
|
| 5 |
+
__version__ = "1.8.4"
|
| 6 |
+
__author__ = "Byaidu"
|
pdf2zh/cache.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tempfile
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import hashlib
|
| 5 |
+
import shutil
|
| 6 |
+
|
| 7 |
+
cache_dir = os.path.join(tempfile.gettempdir(), "cache")
|
| 8 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 9 |
+
time_filename = "update_time"
|
| 10 |
+
max_cache = 5
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def deterministic_hash(obj):
|
| 14 |
+
hash_object = hashlib.sha256()
|
| 15 |
+
hash_object.update(str(obj).encode())
|
| 16 |
+
return hash_object.hexdigest()[0:20]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_dirs():
|
| 20 |
+
dirs = [
|
| 21 |
+
os.path.join(cache_dir, dir)
|
| 22 |
+
for dir in os.listdir(cache_dir)
|
| 23 |
+
if os.path.isdir(os.path.join(cache_dir, dir))
|
| 24 |
+
]
|
| 25 |
+
return dirs
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def get_time(dir):
|
| 29 |
+
try:
|
| 30 |
+
timefile = os.path.join(dir, time_filename)
|
| 31 |
+
t = float(open(timefile, encoding="utf-8").read())
|
| 32 |
+
return t
|
| 33 |
+
except FileNotFoundError:
|
| 34 |
+
# handle the error as needed, for now we'll just return a default value
|
| 35 |
+
return float(
|
| 36 |
+
"inf"
|
| 37 |
+
) # This ensures that this directory will be the first to be removed if required
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def write_time(dir):
|
| 41 |
+
timefile = os.path.join(dir, time_filename)
|
| 42 |
+
t = time.time()
|
| 43 |
+
print(t, file=open(timefile, "w", encoding="utf-8"), end="")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def argmin(iterable):
|
| 47 |
+
return min(enumerate(iterable), key=lambda x: x[1])[0]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def remove_extra():
|
| 51 |
+
dirs = get_dirs()
|
| 52 |
+
for dir in dirs:
|
| 53 |
+
if not os.path.isdir(
|
| 54 |
+
dir
|
| 55 |
+
): # This line might be redundant now, as get_dirs() ensures only directories are returned
|
| 56 |
+
os.remove(dir)
|
| 57 |
+
try:
|
| 58 |
+
get_time(dir)
|
| 59 |
+
except BaseException:
|
| 60 |
+
shutil.rmtree(dir)
|
| 61 |
+
while True:
|
| 62 |
+
dirs = get_dirs()
|
| 63 |
+
if len(dirs) <= max_cache:
|
| 64 |
+
break
|
| 65 |
+
times = [get_time(dir) for dir in dirs]
|
| 66 |
+
arg = argmin(times)
|
| 67 |
+
shutil.rmtree(dirs[arg])
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def is_cached(hash_key):
|
| 71 |
+
dir = os.path.join(cache_dir, hash_key)
|
| 72 |
+
return os.path.exists(dir)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def create_cache(hash_key):
|
| 76 |
+
dir = os.path.join(cache_dir, hash_key)
|
| 77 |
+
os.makedirs(dir, exist_ok=True)
|
| 78 |
+
write_time(dir)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def load_paragraph(hash_key, hash_key_paragraph):
|
| 82 |
+
filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
|
| 83 |
+
if os.path.exists(filename):
|
| 84 |
+
return open(filename, encoding="utf-8").read()
|
| 85 |
+
else:
|
| 86 |
+
return None
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def write_paragraph(hash_key, hash_key_paragraph, paragraph):
|
| 90 |
+
filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
|
| 91 |
+
print(paragraph, file=open(filename, "w", encoding="utf-8"), end="")
|
pdf2zh/converter.py
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
| 2 |
+
from pdfminer.pdffont import PDFFont, PDFCIDFont
|
| 3 |
+
from pdfminer.converter import PDFConverter
|
| 4 |
+
from pdfminer.pdffont import PDFUnicodeNotDefined
|
| 5 |
+
from pdfminer.utils import apply_matrix_pt, mult_matrix
|
| 6 |
+
from pdfminer.layout import (
|
| 7 |
+
LTChar,
|
| 8 |
+
LTFigure,
|
| 9 |
+
LTLine,
|
| 10 |
+
LTPage,
|
| 11 |
+
)
|
| 12 |
+
import logging
|
| 13 |
+
import re
|
| 14 |
+
import concurrent.futures
|
| 15 |
+
import numpy as np
|
| 16 |
+
import unicodedata
|
| 17 |
+
from tenacity import retry, wait_fixed
|
| 18 |
+
from pdf2zh import cache
|
| 19 |
+
from pdf2zh.translator import (
|
| 20 |
+
BaseTranslator,
|
| 21 |
+
GoogleTranslator,
|
| 22 |
+
DeepLTranslator,
|
| 23 |
+
DeepLXTranslator,
|
| 24 |
+
OllamaTranslator,
|
| 25 |
+
OpenAITranslator,
|
| 26 |
+
AzureTranslator,
|
| 27 |
+
TencentTranslator,
|
| 28 |
+
)
|
| 29 |
+
from pymupdf import Font
|
| 30 |
+
|
| 31 |
+
log = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class PDFConverterEx(PDFConverter):
|
| 35 |
+
def __init__(
|
| 36 |
+
self,
|
| 37 |
+
rsrcmgr: PDFResourceManager,
|
| 38 |
+
) -> None:
|
| 39 |
+
PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
|
| 40 |
+
|
| 41 |
+
def begin_page(self, page, ctm) -> None:
|
| 42 |
+
# 重载替换 cropbox
|
| 43 |
+
(x0, y0, x1, y1) = page.cropbox
|
| 44 |
+
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
|
| 45 |
+
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
|
| 46 |
+
mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
|
| 47 |
+
self.cur_item = LTPage(page.pageno, mediabox)
|
| 48 |
+
|
| 49 |
+
def end_page(self, page):
|
| 50 |
+
# 重载返回指令流
|
| 51 |
+
return self.receive_layout(self.cur_item)
|
| 52 |
+
|
| 53 |
+
def begin_figure(self, name, bbox, matrix) -> None:
|
| 54 |
+
# 重载设置 pageid
|
| 55 |
+
self._stack.append(self.cur_item)
|
| 56 |
+
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
|
| 57 |
+
self.cur_item.pageid = self._stack[-1].pageid
|
| 58 |
+
|
| 59 |
+
def end_figure(self, _: str) -> None:
|
| 60 |
+
# 重载返回指令流
|
| 61 |
+
fig = self.cur_item
|
| 62 |
+
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
| 63 |
+
self.cur_item = self._stack.pop()
|
| 64 |
+
self.cur_item.add(fig)
|
| 65 |
+
return self.receive_layout(fig)
|
| 66 |
+
|
| 67 |
+
def render_char(
|
| 68 |
+
self,
|
| 69 |
+
matrix,
|
| 70 |
+
font,
|
| 71 |
+
fontsize: float,
|
| 72 |
+
scaling: float,
|
| 73 |
+
rise: float,
|
| 74 |
+
cid: int,
|
| 75 |
+
ncs,
|
| 76 |
+
graphicstate: PDFGraphicState,
|
| 77 |
+
) -> float:
|
| 78 |
+
# 重载设置 cid 和 font
|
| 79 |
+
try:
|
| 80 |
+
text = font.to_unichr(cid)
|
| 81 |
+
assert isinstance(text, str), str(type(text))
|
| 82 |
+
except PDFUnicodeNotDefined:
|
| 83 |
+
text = self.handle_undefined_char(font, cid)
|
| 84 |
+
textwidth = font.char_width(cid)
|
| 85 |
+
textdisp = font.char_disp(cid)
|
| 86 |
+
item = LTChar(
|
| 87 |
+
matrix,
|
| 88 |
+
font,
|
| 89 |
+
fontsize,
|
| 90 |
+
scaling,
|
| 91 |
+
rise,
|
| 92 |
+
text,
|
| 93 |
+
textwidth,
|
| 94 |
+
textdisp,
|
| 95 |
+
ncs,
|
| 96 |
+
graphicstate,
|
| 97 |
+
)
|
| 98 |
+
self.cur_item.add(item)
|
| 99 |
+
item.cid = cid # hack 插入原字符编码
|
| 100 |
+
item.font = font # hack 插入原字符字体
|
| 101 |
+
return item.adv
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class Paragraph:
|
| 105 |
+
def __init__(self, y, x, x0, x1, size, font, brk):
|
| 106 |
+
self.y: float = y # 初始纵坐标
|
| 107 |
+
self.x: float = x # 初始横坐标
|
| 108 |
+
self.x0: float = x0 # 左边界
|
| 109 |
+
self.x1: float = x1 # 右边界
|
| 110 |
+
self.size: float = size # 字体大小
|
| 111 |
+
self.font: PDFFont = font # 字体
|
| 112 |
+
self.brk: bool = brk # 换行标记
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# fmt: off
|
| 116 |
+
class TranslateConverter(PDFConverterEx):
|
| 117 |
+
def __init__(
|
| 118 |
+
self,
|
| 119 |
+
rsrcmgr,
|
| 120 |
+
vfont: str = None,
|
| 121 |
+
vchar: str = None,
|
| 122 |
+
thread: int = 0,
|
| 123 |
+
layout={},
|
| 124 |
+
lang_in: str = "",
|
| 125 |
+
lang_out: str = "",
|
| 126 |
+
service: str = "",
|
| 127 |
+
resfont: str = "",
|
| 128 |
+
noto: Font = None,
|
| 129 |
+
) -> None:
|
| 130 |
+
super().__init__(rsrcmgr)
|
| 131 |
+
self.vfont = vfont
|
| 132 |
+
self.vchar = vchar
|
| 133 |
+
self.thread = thread
|
| 134 |
+
self.layout = layout
|
| 135 |
+
self.resfont = resfont
|
| 136 |
+
self.noto = noto
|
| 137 |
+
self.translator: BaseTranslator = None
|
| 138 |
+
param = service.split(":", 1)
|
| 139 |
+
if param[0] == "google":
|
| 140 |
+
self.translator = GoogleTranslator(service, lang_out, lang_in, None)
|
| 141 |
+
elif param[0] == "deepl":
|
| 142 |
+
self.translator = DeepLTranslator(service, lang_out, lang_in, None)
|
| 143 |
+
elif param[0] == "deeplx":
|
| 144 |
+
self.translator = DeepLXTranslator(service, lang_out, lang_in, None)
|
| 145 |
+
elif param[0] == "ollama":
|
| 146 |
+
self.translator = OllamaTranslator(service, lang_out, lang_in, param[1])
|
| 147 |
+
elif param[0] == "openai":
|
| 148 |
+
self.translator = OpenAITranslator(service, lang_out, lang_in, param[1])
|
| 149 |
+
elif param[0] == "azure":
|
| 150 |
+
self.translator = AzureTranslator(service, lang_out, lang_in, None)
|
| 151 |
+
elif param[0] == "tencent":
|
| 152 |
+
self.translator = TencentTranslator(service, lang_out, lang_in, None)
|
| 153 |
+
else:
|
| 154 |
+
raise ValueError("Unsupported translation service")
|
| 155 |
+
|
| 156 |
+
def receive_layout(self, ltpage: LTPage):
|
| 157 |
+
# 段落
|
| 158 |
+
sstk: list[str] = [] # 段落文字栈
|
| 159 |
+
pstk: list[Paragraph] = [] # 段落属性栈
|
| 160 |
+
vbkt: int = 0 # 段落公式括号计数
|
| 161 |
+
# 公式组
|
| 162 |
+
vstk: list[LTChar] = [] # 公式符号组
|
| 163 |
+
vlstk: list[LTLine] = [] # 公式线条组
|
| 164 |
+
vfix: float = 0 # 公式纵向偏移
|
| 165 |
+
# 公式组栈
|
| 166 |
+
var: list[list[LTChar]] = [] # 公式符号组栈
|
| 167 |
+
varl: list[list[LTLine]] = [] # 公式线条组栈
|
| 168 |
+
varf: list[float] = [] # 公式纵向偏移栈
|
| 169 |
+
vlen: list[float] = [] # 公式宽度栈
|
| 170 |
+
# 全局
|
| 171 |
+
lstk: list[LTLine] = [] # 全局线条栈
|
| 172 |
+
xt: LTChar = None # 上一个字符
|
| 173 |
+
xt_cls: int = -1 # 上一个字符所属段落
|
| 174 |
+
vmax: float = ltpage.width / 4 # 行内公式最大宽度
|
| 175 |
+
ops: str = "" # 渲染结果
|
| 176 |
+
|
| 177 |
+
def vflag(font: str, char: str): # 匹配公式(和角标)字体
|
| 178 |
+
font = font.split("+")[-1] # 字体名截断
|
| 179 |
+
if re.match(r"\(cid:", char):
|
| 180 |
+
return True
|
| 181 |
+
# 基于字体名规则的判定
|
| 182 |
+
if self.vfont:
|
| 183 |
+
if re.match(self.vfont, font):
|
| 184 |
+
return True
|
| 185 |
+
else:
|
| 186 |
+
if re.match( # latex 字体
|
| 187 |
+
r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
|
| 188 |
+
font,
|
| 189 |
+
):
|
| 190 |
+
return True
|
| 191 |
+
# 基于字符集规则的判定
|
| 192 |
+
if self.vchar:
|
| 193 |
+
if re.match(self.vchar, char):
|
| 194 |
+
return True
|
| 195 |
+
else:
|
| 196 |
+
if (
|
| 197 |
+
char
|
| 198 |
+
and char != " " # 非空格
|
| 199 |
+
and (
|
| 200 |
+
unicodedata.category(char[0])
|
| 201 |
+
in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] # 文字修饰符、数学符号、分隔符号
|
| 202 |
+
or ord(char[0]) in range(0x370, 0x400) # 希腊字母
|
| 203 |
+
)
|
| 204 |
+
):
|
| 205 |
+
return True
|
| 206 |
+
return False
|
| 207 |
+
|
| 208 |
+
############################################################
|
| 209 |
+
# A. 原文档解析
|
| 210 |
+
for child in ltpage:
|
| 211 |
+
if isinstance(child, LTChar):
|
| 212 |
+
cur_v = False
|
| 213 |
+
layout = self.layout[ltpage.pageid]
|
| 214 |
+
# ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
| 215 |
+
h, w = layout.shape
|
| 216 |
+
# 读取当前字符在 layout 中的类别
|
| 217 |
+
cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
|
| 218 |
+
cls = layout[cy, cx]
|
| 219 |
+
if ( # 判定当前字符是否属于公式
|
| 220 |
+
cls == 0 # 1. 类别为保留区域
|
| 221 |
+
or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79) # 2. 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
|
| 222 |
+
or vflag(child.fontname, child.get_text()) # 3. 公式字体
|
| 223 |
+
or (child.matrix[0] == 0 and child.matrix[3] == 0) # 4. 垂直字体
|
| 224 |
+
):
|
| 225 |
+
cur_v = True
|
| 226 |
+
# 判定括号组是否属于公式
|
| 227 |
+
if not cur_v:
|
| 228 |
+
if vstk and child.get_text() == "(":
|
| 229 |
+
cur_v = True
|
| 230 |
+
vbkt += 1
|
| 231 |
+
if vbkt and child.get_text() == ")":
|
| 232 |
+
cur_v = True
|
| 233 |
+
vbkt -= 1
|
| 234 |
+
if ( # 判定当前公式是否结束
|
| 235 |
+
not cur_v # 1. 当前字符不属于公式
|
| 236 |
+
or cls != xt_cls # 2. 当前字符与前一个字符不属于同一段落
|
| 237 |
+
or (abs(child.x0 - xt.x0) > vmax and cls != 0) # 3. 段落内换行,可能是一长串斜体的段落,也可能是段内分式换行,这里设个阈值进行区分
|
| 238 |
+
):
|
| 239 |
+
if vstk:
|
| 240 |
+
if ( # 根据公式右侧的文字修正公式的纵向偏移
|
| 241 |
+
not cur_v # 1. 当前字符不属于公式
|
| 242 |
+
and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
|
| 243 |
+
and child.x0 > max([vch.x0 for vch in vstk]) # 3. 当前字符在公式右侧
|
| 244 |
+
):
|
| 245 |
+
vfix = vstk[0].y0 - child.y0
|
| 246 |
+
sstk[-1] += f"$v{len(var)}$"
|
| 247 |
+
var.append(vstk)
|
| 248 |
+
varl.append(vlstk)
|
| 249 |
+
varf.append(vfix)
|
| 250 |
+
vstk = []
|
| 251 |
+
vlstk = []
|
| 252 |
+
vfix = 0
|
| 253 |
+
# 当前字符不属于公式或当前字符是公式的第一个字符
|
| 254 |
+
if not vstk:
|
| 255 |
+
if cls == xt_cls: # 当前字符与前一个字符属于同一段落
|
| 256 |
+
if child.x0 > xt.x1 + 1: # 添加行内空格
|
| 257 |
+
sstk[-1] += " "
|
| 258 |
+
elif child.x1 < xt.x0: # 添加换行空格并标记原文段落存在换行
|
| 259 |
+
sstk[-1] += " "
|
| 260 |
+
pstk[-1].brk = True
|
| 261 |
+
else: # 根据当前字符构建一个新的段落
|
| 262 |
+
sstk.append("")
|
| 263 |
+
pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
|
| 264 |
+
if not cur_v: # 文字入栈
|
| 265 |
+
if ( # 根据当前字符修正段落属性
|
| 266 |
+
child.size > pstk[-1].size / 0.79 # 1. 当前字符显著比段落字体大
|
| 267 |
+
or len(sstk[-1].strip()) == 1 # 2. 当前字符为段落第二个文字(考虑首字母放大的情况)
|
| 268 |
+
or vflag(pstk[-1].font.fontname, "") # 3. 段落字体为公式字体
|
| 269 |
+
or re.match( # 4. 段落字体为粗体
|
| 270 |
+
r"(.*Medi|.*Bold)",
|
| 271 |
+
pstk[-1].font.fontname,
|
| 272 |
+
re.IGNORECASE,
|
| 273 |
+
)
|
| 274 |
+
):
|
| 275 |
+
pstk[-1].y -= child.size - pstk[-1].size # hack 这个段落纵向位置的修正有问题,不过先凑合用吧
|
| 276 |
+
pstk[-1].size = child.size
|
| 277 |
+
pstk[-1].font = child.font
|
| 278 |
+
sstk[-1] += child.get_text()
|
| 279 |
+
else: # 公式入栈
|
| 280 |
+
if ( # 根据公式左侧的文字修正公式的纵向偏移
|
| 281 |
+
not vstk # 1. 当前字符是公式的第一个字符
|
| 282 |
+
and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落
|
| 283 |
+
and child.x0 > xt.x0 # 3. 前一个字符在公式左侧
|
| 284 |
+
):
|
| 285 |
+
vfix = child.y0 - xt.y0
|
| 286 |
+
vstk.append(child)
|
| 287 |
+
# 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
|
| 288 |
+
pstk[-1].x0 = min(pstk[-1].x0, child.x0)
|
| 289 |
+
pstk[-1].x1 = max(pstk[-1].x1, child.x1)
|
| 290 |
+
# 更新上一个字符
|
| 291 |
+
xt = child
|
| 292 |
+
xt_cls = cls
|
| 293 |
+
elif isinstance(child, LTFigure): # 图表
|
| 294 |
+
pass
|
| 295 |
+
elif isinstance(child, LTLine): # 线条
|
| 296 |
+
layout = self.layout[ltpage.pageid]
|
| 297 |
+
# ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
|
| 298 |
+
h, w = layout.shape
|
| 299 |
+
# 读取当前线条在 layout 中的类别
|
| 300 |
+
cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
|
| 301 |
+
cls = layout[cy, cx]
|
| 302 |
+
if vstk and cls == xt_cls: # 公式线条
|
| 303 |
+
vlstk.append(child)
|
| 304 |
+
else: # 全局线条
|
| 305 |
+
lstk.append(child)
|
| 306 |
+
else:
|
| 307 |
+
pass
|
| 308 |
+
# 处理结尾
|
| 309 |
+
if vstk: # 公式出栈
|
| 310 |
+
sstk[-1] += f"$v{len(var)}$"
|
| 311 |
+
var.append(vstk)
|
| 312 |
+
varl.append(vlstk)
|
| 313 |
+
varf.append(vfix)
|
| 314 |
+
log.debug("\n==========[VSTACK]==========\n")
|
| 315 |
+
for id, v in enumerate(var): # 计算公式宽度
|
| 316 |
+
l = max([vch.x1 for vch in v]) - v[0].x0
|
| 317 |
+
log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
|
| 318 |
+
vlen.append(l)
|
| 319 |
+
|
| 320 |
+
############################################################
|
| 321 |
+
# B. 段落翻译
|
| 322 |
+
log.debug("\n==========[SSTACK]==========\n")
|
| 323 |
+
hash_key = cache.deterministic_hash("PDFMathTranslate")
|
| 324 |
+
cache.create_cache(hash_key)
|
| 325 |
+
|
| 326 |
+
@retry(wait=wait_fixed(1))
|
| 327 |
+
def worker(s: str): # 多线程翻译
|
| 328 |
+
try:
|
| 329 |
+
hash_key_paragraph = cache.deterministic_hash(
|
| 330 |
+
(s, str(self.translator))
|
| 331 |
+
)
|
| 332 |
+
new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
|
| 333 |
+
if new is None:
|
| 334 |
+
new = self.translator.translate(s)
|
| 335 |
+
cache.write_paragraph(hash_key, hash_key_paragraph, new)
|
| 336 |
+
return new
|
| 337 |
+
except BaseException as e:
|
| 338 |
+
if log.isEnabledFor(logging.DEBUG):
|
| 339 |
+
log.exception(e)
|
| 340 |
+
else:
|
| 341 |
+
log.exception(e, exc_info=False)
|
| 342 |
+
raise e
|
| 343 |
+
with concurrent.futures.ThreadPoolExecutor(
|
| 344 |
+
max_workers=self.thread
|
| 345 |
+
) as executor:
|
| 346 |
+
news = list(executor.map(worker, sstk))
|
| 347 |
+
|
| 348 |
+
############################################################
|
| 349 |
+
# C. 新文档排版
|
| 350 |
+
def raw_string(fcur: str, cstk: str): # 编码字符串
|
| 351 |
+
if fcur == 'noto':
|
| 352 |
+
return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
|
| 353 |
+
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
|
| 354 |
+
return "".join(["%04x" % ord(c) for c in cstk])
|
| 355 |
+
else:
|
| 356 |
+
return "".join(["%02x" % ord(c) for c in cstk])
|
| 357 |
+
|
| 358 |
+
_x, _y = 0, 0
|
| 359 |
+
for id, new in enumerate(news):
|
| 360 |
+
x: float = pstk[id].x # 段落初始横坐标
|
| 361 |
+
y: float = pstk[id].y # 段落上边界
|
| 362 |
+
x0: float = pstk[id].x0 # 段落左边界
|
| 363 |
+
x1: float = pstk[id].x1 # 段落右边界
|
| 364 |
+
size: float = pstk[id].size # 段落字体大小
|
| 365 |
+
font: PDFFont = pstk[id].font # 段落字体
|
| 366 |
+
brk: bool = pstk[id].brk # 段落属性
|
| 367 |
+
cstk: str = "" # 当前文字栈
|
| 368 |
+
fcur: str = None # 当前字体ID
|
| 369 |
+
tx = x
|
| 370 |
+
fcur_ = fcur
|
| 371 |
+
ptr = 0
|
| 372 |
+
log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[id]} | {new}")
|
| 373 |
+
while ptr < len(new):
|
| 374 |
+
vy_regex = re.match(
|
| 375 |
+
r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
|
| 376 |
+
) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
|
| 377 |
+
mod = 0 # 文字修饰符
|
| 378 |
+
if vy_regex: # 加载公式
|
| 379 |
+
ptr += len(vy_regex.group(0))
|
| 380 |
+
try:
|
| 381 |
+
vid = int(vy_regex.group(1).replace(" ", ""))
|
| 382 |
+
adv = vlen[vid]
|
| 383 |
+
except Exception:
|
| 384 |
+
continue # 翻译器可能会自动补个越界的公式标记
|
| 385 |
+
if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符
|
| 386 |
+
mod = var[vid][-1].width
|
| 387 |
+
else: # 加载文字
|
| 388 |
+
ch = new[ptr]
|
| 389 |
+
fcur_ = None
|
| 390 |
+
# 原字体编码容易出问题,这里直接放弃掉
|
| 391 |
+
# try:
|
| 392 |
+
# if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
|
| 393 |
+
# fcur_=self.fontid[font] # 原字体
|
| 394 |
+
# except:
|
| 395 |
+
# pass
|
| 396 |
+
try:
|
| 397 |
+
if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
|
| 398 |
+
fcur_ = "tiro" # 默认拉丁字体
|
| 399 |
+
except Exception:
|
| 400 |
+
pass
|
| 401 |
+
if fcur_ is None:
|
| 402 |
+
fcur_ = self.resfont # 默认非拉丁字体
|
| 403 |
+
# print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
|
| 404 |
+
if fcur_ == 'noto':
|
| 405 |
+
adv = self.noto.char_lengths(ch, size)[0]
|
| 406 |
+
else:
|
| 407 |
+
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
|
| 408 |
+
ptr += 1
|
| 409 |
+
if ( # 输出文字缓冲区
|
| 410 |
+
fcur_ != fcur # 1. 字体更新
|
| 411 |
+
or vy_regex # 2. 插入公式
|
| 412 |
+
or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
|
| 413 |
+
):
|
| 414 |
+
if cstk:
|
| 415 |
+
ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
| 416 |
+
cstk = ""
|
| 417 |
+
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
|
| 418 |
+
x = x0
|
| 419 |
+
lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
|
| 420 |
+
y -= size * lang_space.get(self.translator.lang_out, 1.1) # 小语种大多适配 1.1
|
| 421 |
+
if vy_regex: # 插入公式
|
| 422 |
+
fix = 0
|
| 423 |
+
if fcur is not None: # 段落内公式修正纵向偏移
|
| 424 |
+
fix = varf[vid]
|
| 425 |
+
for vch in var[vid]: # 排版公式字符
|
| 426 |
+
vc = chr(vch.cid)
|
| 427 |
+
ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
|
| 428 |
+
if log.isEnabledFor(logging.DEBUG):
|
| 429 |
+
lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
|
| 430 |
+
_x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
|
| 431 |
+
for l in varl[vid]: # 排版公式线条
|
| 432 |
+
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
| 433 |
+
ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
|
| 434 |
+
else: # 插入文字缓冲区
|
| 435 |
+
if not cstk: # 单行开头
|
| 436 |
+
tx = x
|
| 437 |
+
if x == x0 and ch == " ": # 消除段落换行空格
|
| 438 |
+
adv = 0
|
| 439 |
+
else:
|
| 440 |
+
cstk += ch
|
| 441 |
+
else:
|
| 442 |
+
cstk += ch
|
| 443 |
+
adv -= mod # 文字修饰符
|
| 444 |
+
fcur = fcur_
|
| 445 |
+
x += adv
|
| 446 |
+
if log.isEnabledFor(logging.DEBUG):
|
| 447 |
+
lstk.append(LTLine(0.1, (_x, _y), (x, y)))
|
| 448 |
+
_x, _y = x, y
|
| 449 |
+
# 处理结尾
|
| 450 |
+
if cstk:
|
| 451 |
+
ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
|
| 452 |
+
for l in lstk: # 排版全局线条
|
| 453 |
+
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
|
| 454 |
+
ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
|
| 455 |
+
ops = f"BT {ops}ET "
|
| 456 |
+
return ops
|
pdf2zh/doclayout.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import abc
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import ast
|
| 5 |
+
import onnx
|
| 6 |
+
import onnxruntime
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DocLayoutModel(abc.ABC):
|
| 11 |
+
@staticmethod
|
| 12 |
+
def load_onnx():
|
| 13 |
+
model = OnnxModel.from_pretrained(
|
| 14 |
+
repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx",
|
| 15 |
+
filename="doclayout_yolo_docstructbench_imgsz1024.onnx",
|
| 16 |
+
)
|
| 17 |
+
return model
|
| 18 |
+
|
| 19 |
+
@staticmethod
|
| 20 |
+
def load_available():
|
| 21 |
+
return DocLayoutModel.load_onnx()
|
| 22 |
+
|
| 23 |
+
@property
|
| 24 |
+
@abc.abstractmethod
|
| 25 |
+
def stride(self) -> int:
|
| 26 |
+
"""Stride of the model input."""
|
| 27 |
+
pass
|
| 28 |
+
|
| 29 |
+
@abc.abstractmethod
|
| 30 |
+
def predict(self, image, imgsz=1024, **kwargs) -> list:
|
| 31 |
+
"""
|
| 32 |
+
Predict the layout of a document page.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
image: The image of the document page.
|
| 36 |
+
imgsz: Resize the image to this size. Must be a multiple of the stride.
|
| 37 |
+
**kwargs: Additional arguments.
|
| 38 |
+
"""
|
| 39 |
+
pass
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class YoloResult:
|
| 43 |
+
"""Helper class to store detection results from ONNX model."""
|
| 44 |
+
|
| 45 |
+
def __init__(self, boxes, names):
|
| 46 |
+
self.boxes = [YoloBox(data=d) for d in boxes]
|
| 47 |
+
self.boxes.sort(key=lambda x: x.conf, reverse=True)
|
| 48 |
+
self.names = names
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class YoloBox:
|
| 52 |
+
"""Helper class to store detection results from ONNX model."""
|
| 53 |
+
|
| 54 |
+
def __init__(self, data):
|
| 55 |
+
self.xyxy = data[:4]
|
| 56 |
+
self.conf = data[-2]
|
| 57 |
+
self.cls = data[-1]
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class OnnxModel(DocLayoutModel):
|
| 61 |
+
def __init__(self, model_path: str):
|
| 62 |
+
self.model_path = model_path
|
| 63 |
+
|
| 64 |
+
model = onnx.load(model_path)
|
| 65 |
+
metadata = {d.key: d.value for d in model.metadata_props}
|
| 66 |
+
self._stride = ast.literal_eval(metadata["stride"])
|
| 67 |
+
self._names = ast.literal_eval(metadata["names"])
|
| 68 |
+
|
| 69 |
+
self.model = onnxruntime.InferenceSession(model.SerializeToString())
|
| 70 |
+
|
| 71 |
+
@staticmethod
|
| 72 |
+
def from_pretrained(repo_id: str, filename: str):
|
| 73 |
+
pth = hf_hub_download(repo_id=repo_id, filename=filename)
|
| 74 |
+
return OnnxModel(pth)
|
| 75 |
+
|
| 76 |
+
@property
|
| 77 |
+
def stride(self):
|
| 78 |
+
return self._stride
|
| 79 |
+
|
| 80 |
+
def resize_and_pad_image(self, image, new_shape):
|
| 81 |
+
"""
|
| 82 |
+
Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.
|
| 83 |
+
|
| 84 |
+
Parameters:
|
| 85 |
+
- image: Input image
|
| 86 |
+
- new_shape: Target size (integer or (height, width) tuple)
|
| 87 |
+
- stride: Padding alignment stride, default 32
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
- Processed image
|
| 91 |
+
"""
|
| 92 |
+
if isinstance(new_shape, int):
|
| 93 |
+
new_shape = (new_shape, new_shape)
|
| 94 |
+
|
| 95 |
+
h, w = image.shape[:2]
|
| 96 |
+
new_h, new_w = new_shape
|
| 97 |
+
|
| 98 |
+
# Calculate scaling ratio
|
| 99 |
+
r = min(new_h / h, new_w / w)
|
| 100 |
+
resized_h, resized_w = int(round(h * r)), int(round(w * r))
|
| 101 |
+
|
| 102 |
+
# Resize image
|
| 103 |
+
image = cv2.resize(
|
| 104 |
+
image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
# Calculate padding size and align to stride multiple
|
| 108 |
+
pad_w = (new_w - resized_w) % self.stride
|
| 109 |
+
pad_h = (new_h - resized_h) % self.stride
|
| 110 |
+
top, bottom = pad_h // 2, pad_h - pad_h // 2
|
| 111 |
+
left, right = pad_w // 2, pad_w - pad_w // 2
|
| 112 |
+
|
| 113 |
+
# Add padding
|
| 114 |
+
image = cv2.copyMakeBorder(
|
| 115 |
+
image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
return image
|
| 119 |
+
|
| 120 |
+
def scale_boxes(self, img1_shape, boxes, img0_shape):
|
| 121 |
+
"""
|
| 122 |
+
Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
|
| 123 |
+
specified in (img1_shape) to the shape of a different image (img0_shape).
|
| 124 |
+
|
| 125 |
+
Args:
|
| 126 |
+
img1_shape (tuple): The shape of the image that the bounding boxes are for,
|
| 127 |
+
in the format of (height, width).
|
| 128 |
+
boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
|
| 129 |
+
img0_shape (tuple): the shape of the target image, in the format of (height, width).
|
| 130 |
+
|
| 131 |
+
Returns:
|
| 132 |
+
boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
|
| 133 |
+
"""
|
| 134 |
+
|
| 135 |
+
# Calculate scaling ratio
|
| 136 |
+
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
|
| 137 |
+
|
| 138 |
+
# Calculate padding size
|
| 139 |
+
pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
|
| 140 |
+
pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)
|
| 141 |
+
|
| 142 |
+
# Remove padding and scale boxes
|
| 143 |
+
boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
|
| 144 |
+
return boxes
|
| 145 |
+
|
| 146 |
+
def predict(self, image, imgsz=1024, **kwargs):
|
| 147 |
+
# Preprocess input image
|
| 148 |
+
orig_h, orig_w = image.shape[:2]
|
| 149 |
+
pix = self.resize_and_pad_image(image, new_shape=imgsz)
|
| 150 |
+
pix = np.transpose(pix, (2, 0, 1)) # CHW
|
| 151 |
+
pix = np.expand_dims(pix, axis=0) # BCHW
|
| 152 |
+
pix = pix.astype(np.float32) / 255.0 # Normalize to [0, 1]
|
| 153 |
+
new_h, new_w = pix.shape[2:]
|
| 154 |
+
|
| 155 |
+
# Run inference
|
| 156 |
+
preds = self.model.run(None, {"images": pix})[0]
|
| 157 |
+
|
| 158 |
+
# Postprocess predictions
|
| 159 |
+
preds = preds[preds[..., 4] > 0.25]
|
| 160 |
+
preds[..., :4] = self.scale_boxes(
|
| 161 |
+
(new_h, new_w), preds[..., :4], (orig_h, orig_w)
|
| 162 |
+
)
|
| 163 |
+
return [YoloResult(boxes=preds, names=self._names)]
|
pdf2zh/gui.py
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from pdf2zh import __version__
|
| 5 |
+
from pdf2zh.pdf2zh import extract_text
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pymupdf
|
| 10 |
+
import tqdm
|
| 11 |
+
import requests
|
| 12 |
+
import cgi
|
| 13 |
+
|
| 14 |
+
# Map service names to pdf2zh service options
|
| 15 |
+
service_map = {
|
| 16 |
+
"Google": ("google", None, None),
|
| 17 |
+
"DeepL": ("deepl", "DEEPL_AUTH_KEY", None),
|
| 18 |
+
"DeepLX": ("deeplx", "DEEPLX_AUTH_KEY", None),
|
| 19 |
+
"Ollama": ("ollama", None, "gemma2"),
|
| 20 |
+
"OpenAI": ("openai", "OPENAI_API_KEY", "gpt-4o"),
|
| 21 |
+
"Azure": ("azure", "AZURE_APIKEY", None),
|
| 22 |
+
"Tencent": ("tencent", "TENCENT_SECRET_KEY", None),
|
| 23 |
+
}
|
| 24 |
+
lang_map = {
|
| 25 |
+
"Chinese": "zh",
|
| 26 |
+
"English": "en",
|
| 27 |
+
"French": "fr",
|
| 28 |
+
"German": "de",
|
| 29 |
+
"Japanese": "ja",
|
| 30 |
+
"Korean": "ko",
|
| 31 |
+
"Russian": "ru",
|
| 32 |
+
"Spanish": "es",
|
| 33 |
+
"Italian": "it",
|
| 34 |
+
}
|
| 35 |
+
page_map = {
|
| 36 |
+
"All": None,
|
| 37 |
+
"First": [0],
|
| 38 |
+
"First 5 pages": list(range(0, 5)),
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
flag_demo = False
|
| 42 |
+
if os.environ.get("PDF2ZH_DEMO"):
|
| 43 |
+
flag_demo = True
|
| 44 |
+
service_map = {
|
| 45 |
+
"Google": ("google", None, None),
|
| 46 |
+
}
|
| 47 |
+
page_map = {
|
| 48 |
+
"First": [0],
|
| 49 |
+
"First 20 pages": list(range(0, 20)),
|
| 50 |
+
}
|
| 51 |
+
client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
|
| 52 |
+
server_key = os.environ.get("PDF2ZH_SERVER_KEY")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def verify_recaptcha(response):
|
| 56 |
+
recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"
|
| 57 |
+
|
| 58 |
+
print("reCAPTCHA", server_key, response)
|
| 59 |
+
|
| 60 |
+
data = {"secret": server_key, "response": response}
|
| 61 |
+
result = requests.post(recaptcha_url, data=data).json()
|
| 62 |
+
|
| 63 |
+
print("reCAPTCHA", result.get("success"))
|
| 64 |
+
|
| 65 |
+
return result.get("success")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def pdf_preview(file):
|
| 69 |
+
doc = pymupdf.open(file)
|
| 70 |
+
page = doc[0]
|
| 71 |
+
pix = page.get_pixmap()
|
| 72 |
+
image = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
|
| 73 |
+
return image
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def upload_file(file, service, progress=gr.Progress()):
|
| 77 |
+
"""Handle file upload, validation, and initial preview."""
|
| 78 |
+
if not file or not os.path.exists(file):
|
| 79 |
+
return None, None
|
| 80 |
+
|
| 81 |
+
try:
|
| 82 |
+
# Convert first page for preview
|
| 83 |
+
preview_image = pdf_preview(file)
|
| 84 |
+
|
| 85 |
+
return file, preview_image
|
| 86 |
+
except Exception as e:
|
| 87 |
+
print(f"Error converting PDF: {e}")
|
| 88 |
+
return None, None
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def download_with_limit(url, save_path, size_limit):
|
| 92 |
+
chunk_size = 1024
|
| 93 |
+
total_size = 0
|
| 94 |
+
with requests.get(url, stream=True, timeout=10) as response:
|
| 95 |
+
response.raise_for_status()
|
| 96 |
+
content = response.headers.get("Content-Disposition")
|
| 97 |
+
try:
|
| 98 |
+
_, params = cgi.parse_header(content)
|
| 99 |
+
filename = params["filename"]
|
| 100 |
+
except Exception:
|
| 101 |
+
filename = os.path.basename(url)
|
| 102 |
+
with open(save_path / filename, "wb") as file:
|
| 103 |
+
for chunk in response.iter_content(chunk_size=chunk_size):
|
| 104 |
+
total_size += len(chunk)
|
| 105 |
+
if size_limit and total_size > size_limit:
|
| 106 |
+
raise gr.Error("Exceeds file size limit")
|
| 107 |
+
file.write(chunk)
|
| 108 |
+
return save_path / filename
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def translate(
|
| 112 |
+
file_type,
|
| 113 |
+
file_input,
|
| 114 |
+
link_input,
|
| 115 |
+
service,
|
| 116 |
+
apikey,
|
| 117 |
+
model_id,
|
| 118 |
+
lang_from,
|
| 119 |
+
lang_to,
|
| 120 |
+
page_range,
|
| 121 |
+
recaptcha_response,
|
| 122 |
+
progress=gr.Progress(),
|
| 123 |
+
):
|
| 124 |
+
"""Translate PDF content using selected service."""
|
| 125 |
+
if flag_demo and not verify_recaptcha(recaptcha_response):
|
| 126 |
+
raise gr.Error("reCAPTCHA fail")
|
| 127 |
+
|
| 128 |
+
progress(0, desc="Starting translation...")
|
| 129 |
+
|
| 130 |
+
output = Path("pdf2zh_files")
|
| 131 |
+
output.mkdir(parents=True, exist_ok=True)
|
| 132 |
+
|
| 133 |
+
if file_type == "File":
|
| 134 |
+
if not file_input:
|
| 135 |
+
raise gr.Error("No input")
|
| 136 |
+
file_path = shutil.copy(file_input, output)
|
| 137 |
+
else:
|
| 138 |
+
if not link_input:
|
| 139 |
+
raise gr.Error("No input")
|
| 140 |
+
file_path = download_with_limit(
|
| 141 |
+
link_input,
|
| 142 |
+
output,
|
| 143 |
+
5 * 1024 * 1024 if flag_demo else None,
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
filename = os.path.splitext(os.path.basename(file_path))[0]
|
| 147 |
+
file_en = output / f"{filename}.pdf"
|
| 148 |
+
file_zh = output / f"{filename}-zh.pdf"
|
| 149 |
+
file_dual = output / f"{filename}-dual.pdf"
|
| 150 |
+
|
| 151 |
+
selected_service = service_map[service][0]
|
| 152 |
+
if service_map[service][1]:
|
| 153 |
+
os.environ.setdefault(service_map[service][1], apikey)
|
| 154 |
+
selected_page = page_map[page_range]
|
| 155 |
+
lang_from = lang_map[lang_from]
|
| 156 |
+
lang_to = lang_map[lang_to]
|
| 157 |
+
if selected_service == "google":
|
| 158 |
+
lang_from = "zh-CN" if lang_from == "zh" else lang_from
|
| 159 |
+
lang_to = "zh-CN" if lang_to == "zh" else lang_to
|
| 160 |
+
|
| 161 |
+
print(f"Files before translation: {os.listdir(output)}")
|
| 162 |
+
|
| 163 |
+
def progress_bar(t: tqdm.tqdm):
|
| 164 |
+
progress(t.n / t.total, desc="Translating...")
|
| 165 |
+
|
| 166 |
+
param = {
|
| 167 |
+
"files": [file_en],
|
| 168 |
+
"pages": selected_page,
|
| 169 |
+
"lang_in": lang_from,
|
| 170 |
+
"lang_out": lang_to,
|
| 171 |
+
"service": f"{selected_service}:{model_id}",
|
| 172 |
+
"output": output,
|
| 173 |
+
"thread": 4,
|
| 174 |
+
"callback": progress_bar,
|
| 175 |
+
}
|
| 176 |
+
print(param)
|
| 177 |
+
extract_text(**param)
|
| 178 |
+
print(f"Files after translation: {os.listdir(output)}")
|
| 179 |
+
|
| 180 |
+
if not file_zh.exists() or not file_dual.exists():
|
| 181 |
+
raise gr.Error("No output")
|
| 182 |
+
|
| 183 |
+
try:
|
| 184 |
+
translated_preview = pdf_preview(str(file_zh))
|
| 185 |
+
except Exception:
|
| 186 |
+
raise gr.Error("No preview")
|
| 187 |
+
|
| 188 |
+
progress(1.0, desc="Translation complete!")
|
| 189 |
+
|
| 190 |
+
return (
|
| 191 |
+
str(file_zh),
|
| 192 |
+
translated_preview,
|
| 193 |
+
str(file_dual),
|
| 194 |
+
gr.update(visible=True),
|
| 195 |
+
gr.update(visible=True),
|
| 196 |
+
gr.update(visible=True),
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# Global setup
|
| 201 |
+
custom_blue = gr.themes.Color(
|
| 202 |
+
c50="#E8F3FF",
|
| 203 |
+
c100="#BEDAFF",
|
| 204 |
+
c200="#94BFFF",
|
| 205 |
+
c300="#6AA1FF",
|
| 206 |
+
c400="#4080FF",
|
| 207 |
+
c500="#165DFF", # Primary color
|
| 208 |
+
c600="#0E42D2",
|
| 209 |
+
c700="#0A2BA6",
|
| 210 |
+
c800="#061D79",
|
| 211 |
+
c900="#03114D",
|
| 212 |
+
c950="#020B33",
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
with gr.Blocks(
|
| 216 |
+
title="PDFMathTranslate - PDF Translation with preserved formats",
|
| 217 |
+
theme=gr.themes.Default(
|
| 218 |
+
primary_hue=custom_blue, spacing_size="md", radius_size="lg"
|
| 219 |
+
),
|
| 220 |
+
css="""
|
| 221 |
+
.secondary-text {color: #999 !important;}
|
| 222 |
+
footer {visibility: hidden}
|
| 223 |
+
.env-warning {color: #dd5500 !important;}
|
| 224 |
+
.env-success {color: #559900 !important;}
|
| 225 |
+
|
| 226 |
+
/* Add dashed border to input-file class */
|
| 227 |
+
.input-file {
|
| 228 |
+
border: 1.2px dashed #165DFF !important;
|
| 229 |
+
border-radius: 6px !important;
|
| 230 |
+
# background-color: #ffffff !important;
|
| 231 |
+
transition: background-color 0.4s ease-out;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.input-file:hover {
|
| 235 |
+
border: 1.2px dashed #165DFF !important;
|
| 236 |
+
border-radius: 6px !important;
|
| 237 |
+
color: #165DFF !important;
|
| 238 |
+
background-color: #E8F3FF !important;
|
| 239 |
+
transition: background-color 0.2s ease-in;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
.progress-bar-wrap {
|
| 243 |
+
border-radius: 8px !important;
|
| 244 |
+
}
|
| 245 |
+
.progress-bar {
|
| 246 |
+
border-radius: 8px !important;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
# .input-file label {
|
| 250 |
+
# color: #165DFF !important;
|
| 251 |
+
# border: 1.2px dashed #165DFF !important;
|
| 252 |
+
# border-left: none !important;
|
| 253 |
+
# border-top: none !important;
|
| 254 |
+
# }
|
| 255 |
+
# .input-file .wrap {
|
| 256 |
+
# color: #165DFF !important;
|
| 257 |
+
# }
|
| 258 |
+
# .input-file .or {
|
| 259 |
+
# color: #165DFF !important;
|
| 260 |
+
# }
|
| 261 |
+
""",
|
| 262 |
+
head=(
|
| 263 |
+
"""
|
| 264 |
+
<script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
|
| 265 |
+
<script type="text/javascript">
|
| 266 |
+
var onVerify = function(token) {
|
| 267 |
+
el=document.getElementById('verify').getElementsByTagName('textarea')[0];
|
| 268 |
+
el.value=token;
|
| 269 |
+
el.dispatchEvent(new Event('input'));
|
| 270 |
+
};
|
| 271 |
+
</script>
|
| 272 |
+
"""
|
| 273 |
+
if flag_demo
|
| 274 |
+
else ""
|
| 275 |
+
),
|
| 276 |
+
) as demo:
|
| 277 |
+
gr.Markdown(
|
| 278 |
+
"# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
with gr.Row():
|
| 282 |
+
with gr.Column(scale=1):
|
| 283 |
+
gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
|
| 284 |
+
file_type = gr.Radio(
|
| 285 |
+
choices=["File", "Link"],
|
| 286 |
+
label="Type",
|
| 287 |
+
value="File",
|
| 288 |
+
)
|
| 289 |
+
file_input = gr.File(
|
| 290 |
+
label="File",
|
| 291 |
+
file_count="single",
|
| 292 |
+
file_types=[".pdf"],
|
| 293 |
+
type="filepath",
|
| 294 |
+
elem_classes=["input-file"],
|
| 295 |
+
)
|
| 296 |
+
link_input = gr.Textbox(
|
| 297 |
+
label="Link",
|
| 298 |
+
visible=False,
|
| 299 |
+
interactive=True,
|
| 300 |
+
)
|
| 301 |
+
gr.Markdown("## Option")
|
| 302 |
+
with gr.Row():
|
| 303 |
+
service = gr.Dropdown(
|
| 304 |
+
label="Service",
|
| 305 |
+
choices=service_map.keys(),
|
| 306 |
+
value="Google",
|
| 307 |
+
)
|
| 308 |
+
apikey = gr.Textbox(
|
| 309 |
+
label="API Key",
|
| 310 |
+
max_lines=1,
|
| 311 |
+
visible=False,
|
| 312 |
+
)
|
| 313 |
+
with gr.Row():
|
| 314 |
+
lang_from = gr.Dropdown(
|
| 315 |
+
label="Translate from",
|
| 316 |
+
choices=lang_map.keys(),
|
| 317 |
+
value="English",
|
| 318 |
+
)
|
| 319 |
+
lang_to = gr.Dropdown(
|
| 320 |
+
label="Translate to",
|
| 321 |
+
choices=lang_map.keys(),
|
| 322 |
+
value="Chinese",
|
| 323 |
+
)
|
| 324 |
+
page_range = gr.Radio(
|
| 325 |
+
choices=page_map.keys(),
|
| 326 |
+
label="Pages",
|
| 327 |
+
value=list(page_map.keys())[0],
|
| 328 |
+
)
|
| 329 |
+
model_id = gr.Textbox(
|
| 330 |
+
label="Model ID",
|
| 331 |
+
visible=False,
|
| 332 |
+
interactive=True,
|
| 333 |
+
)
|
| 334 |
+
envs_status = "<span class='env-success'>- Properly configured.</span><br>"
|
| 335 |
+
|
| 336 |
+
def details_wrapper(text_markdown):
|
| 337 |
+
text = f"""
|
| 338 |
+
<summary>Technical details</summary>
|
| 339 |
+
{text_markdown}
|
| 340 |
+
- GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>
|
| 341 |
+
- GUI by: <a href="https://github.com/reycn">Rongxin</a><br>
|
| 342 |
+
- Version: {__version__}
|
| 343 |
+
"""
|
| 344 |
+
return text
|
| 345 |
+
|
| 346 |
+
def env_var_checker(env_var_name: str) -> str:
|
| 347 |
+
if env_var_name:
|
| 348 |
+
if not os.environ.get(env_var_name):
|
| 349 |
+
envs_status = (
|
| 350 |
+
f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
|
| 351 |
+
+ "</span><br>- Please make sure that the environment variables are properly configured "
|
| 352 |
+
+ "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
|
| 353 |
+
)
|
| 354 |
+
else:
|
| 355 |
+
value = str(os.environ.get(env_var_name))
|
| 356 |
+
envs_status = "<span class='env-success'>- Properly configured.</span><br>"
|
| 357 |
+
envs_status += (
|
| 358 |
+
f"- {env_var_name}: <code>{value[:13]}***</code><br>"
|
| 359 |
+
)
|
| 360 |
+
else:
|
| 361 |
+
envs_status = (
|
| 362 |
+
"<span class='env-success'>- Properly configured.</span><br>"
|
| 363 |
+
)
|
| 364 |
+
return details_wrapper(envs_status)
|
| 365 |
+
|
| 366 |
+
def on_select_service(service, evt: gr.EventData):
|
| 367 |
+
if service_map[service][1]:
|
| 368 |
+
apikey_content = gr.update(
|
| 369 |
+
visible=True, value=os.environ.get(service_map[service][1])
|
| 370 |
+
)
|
| 371 |
+
else:
|
| 372 |
+
apikey_content = gr.update(visible=False)
|
| 373 |
+
if service_map[service][2]:
|
| 374 |
+
model_visibility = gr.update(
|
| 375 |
+
visible=True, value=service_map[service][2]
|
| 376 |
+
)
|
| 377 |
+
else:
|
| 378 |
+
model_visibility = gr.update(visible=False)
|
| 379 |
+
return (
|
| 380 |
+
env_var_checker(service_map[service][1]),
|
| 381 |
+
model_visibility,
|
| 382 |
+
apikey_content,
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
def on_select_filetype(file_type):
|
| 386 |
+
return (
|
| 387 |
+
gr.update(visible=file_type == "File"),
|
| 388 |
+
gr.update(visible=file_type == "Link"),
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
output_title = gr.Markdown("## Translated", visible=False)
|
| 392 |
+
output_file = gr.File(label="Download Translation", visible=False)
|
| 393 |
+
output_file_dual = gr.File(
|
| 394 |
+
label="Download Translation (Dual)", visible=False
|
| 395 |
+
)
|
| 396 |
+
recaptcha_response = gr.Textbox(
|
| 397 |
+
label="reCAPTCHA Response", elem_id="verify", visible=False
|
| 398 |
+
)
|
| 399 |
+
recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
|
| 400 |
+
translate_btn = gr.Button("Translate", variant="primary")
|
| 401 |
+
tech_details_tog = gr.Markdown(
|
| 402 |
+
details_wrapper(envs_status),
|
| 403 |
+
elem_classes=["secondary-text"],
|
| 404 |
+
)
|
| 405 |
+
service.select(
|
| 406 |
+
on_select_service, service, [tech_details_tog, model_id, apikey]
|
| 407 |
+
)
|
| 408 |
+
file_type.select(
|
| 409 |
+
on_select_filetype,
|
| 410 |
+
file_type,
|
| 411 |
+
[file_input, link_input],
|
| 412 |
+
js=(
|
| 413 |
+
f"""
|
| 414 |
+
(a,b)=>{{
|
| 415 |
+
try{{
|
| 416 |
+
grecaptcha.render('recaptcha-box',{{
|
| 417 |
+
'sitekey':'{client_key}',
|
| 418 |
+
'callback':'onVerify'
|
| 419 |
+
}});
|
| 420 |
+
}}catch(error){{}}
|
| 421 |
+
return [a];
|
| 422 |
+
}}
|
| 423 |
+
"""
|
| 424 |
+
if flag_demo
|
| 425 |
+
else ""
|
| 426 |
+
),
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
with gr.Column(scale=2):
|
| 430 |
+
gr.Markdown("## Preview")
|
| 431 |
+
preview = gr.Image(label="Document Preview", visible=True)
|
| 432 |
+
|
| 433 |
+
# Event handlers
|
| 434 |
+
file_input.upload(
|
| 435 |
+
upload_file,
|
| 436 |
+
inputs=[file_input, service],
|
| 437 |
+
outputs=[file_input, preview],
|
| 438 |
+
js=(
|
| 439 |
+
f"""
|
| 440 |
+
(a,b)=>{{
|
| 441 |
+
try{{
|
| 442 |
+
grecaptcha.render('recaptcha-box',{{
|
| 443 |
+
'sitekey':'{client_key}',
|
| 444 |
+
'callback':'onVerify'
|
| 445 |
+
}});
|
| 446 |
+
}}catch(error){{}}
|
| 447 |
+
return [a];
|
| 448 |
+
}}
|
| 449 |
+
"""
|
| 450 |
+
if flag_demo
|
| 451 |
+
else ""
|
| 452 |
+
),
|
| 453 |
+
)
|
| 454 |
+
|
| 455 |
+
translate_btn.click(
|
| 456 |
+
translate,
|
| 457 |
+
inputs=[
|
| 458 |
+
file_type,
|
| 459 |
+
file_input,
|
| 460 |
+
link_input,
|
| 461 |
+
service,
|
| 462 |
+
apikey,
|
| 463 |
+
model_id,
|
| 464 |
+
lang_from,
|
| 465 |
+
lang_to,
|
| 466 |
+
page_range,
|
| 467 |
+
recaptcha_response,
|
| 468 |
+
],
|
| 469 |
+
outputs=[
|
| 470 |
+
output_file,
|
| 471 |
+
preview,
|
| 472 |
+
output_file_dual,
|
| 473 |
+
output_file,
|
| 474 |
+
output_file_dual,
|
| 475 |
+
output_title,
|
| 476 |
+
],
|
| 477 |
+
).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def setup_gui(share=False):
|
| 481 |
+
if flag_demo:
|
| 482 |
+
demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
|
| 483 |
+
else:
|
| 484 |
+
try:
|
| 485 |
+
demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
|
| 486 |
+
except Exception:
|
| 487 |
+
print(
|
| 488 |
+
"Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
|
| 489 |
+
)
|
| 490 |
+
try:
|
| 491 |
+
demo.launch(
|
| 492 |
+
server_name="127.0.0.1", debug=True, inbrowser=True, share=share
|
| 493 |
+
)
|
| 494 |
+
except Exception:
|
| 495 |
+
print(
|
| 496 |
+
"Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
|
| 497 |
+
)
|
| 498 |
+
demo.launch(debug=True, inbrowser=True, share=True)
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
# For auto-reloading while developing
|
| 502 |
+
if __name__ == "__main__":
|
| 503 |
+
setup_gui()
|
pdf2zh/high_level.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Functions that can be used for the most common use-cases for pdf2zh.six"""
|
| 2 |
+
|
| 3 |
+
from typing import BinaryIO
|
| 4 |
+
import numpy as np
|
| 5 |
+
import tqdm
|
| 6 |
+
from pymupdf import Document
|
| 7 |
+
from pdfminer.pdfpage import PDFPage
|
| 8 |
+
from pdfminer.pdfinterp import PDFResourceManager
|
| 9 |
+
from pdfminer.pdfdocument import PDFDocument
|
| 10 |
+
from pdfminer.pdfparser import PDFParser
|
| 11 |
+
from pdf2zh.converter import TranslateConverter
|
| 12 |
+
from pdf2zh.pdfinterp import PDFPageInterpreterEx
|
| 13 |
+
from pymupdf import Font
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def extract_text_to_fp(
|
| 17 |
+
inf: BinaryIO,
|
| 18 |
+
pages=None,
|
| 19 |
+
password: str = "",
|
| 20 |
+
debug: bool = False,
|
| 21 |
+
page_count: int = 0,
|
| 22 |
+
vfont: str = "",
|
| 23 |
+
vchar: str = "",
|
| 24 |
+
thread: int = 0,
|
| 25 |
+
doc_en: Document = None,
|
| 26 |
+
model=None,
|
| 27 |
+
lang_in: str = "",
|
| 28 |
+
lang_out: str = "",
|
| 29 |
+
service: str = "",
|
| 30 |
+
resfont: str = "",
|
| 31 |
+
noto: Font = None,
|
| 32 |
+
callback: object = None,
|
| 33 |
+
**kwarg,
|
| 34 |
+
) -> None:
|
| 35 |
+
rsrcmgr = PDFResourceManager()
|
| 36 |
+
layout = {}
|
| 37 |
+
device = TranslateConverter(
|
| 38 |
+
rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
assert device is not None
|
| 42 |
+
obj_patch = {}
|
| 43 |
+
interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
|
| 44 |
+
if pages:
|
| 45 |
+
total_pages = len(pages)
|
| 46 |
+
else:
|
| 47 |
+
total_pages = page_count
|
| 48 |
+
|
| 49 |
+
parser = PDFParser(inf)
|
| 50 |
+
doc = PDFDocument(parser, password=password)
|
| 51 |
+
with tqdm.tqdm(
|
| 52 |
+
enumerate(PDFPage.create_pages(doc)),
|
| 53 |
+
total=total_pages,
|
| 54 |
+
) as progress:
|
| 55 |
+
for pageno, page in progress:
|
| 56 |
+
if pages and (pageno not in pages):
|
| 57 |
+
continue
|
| 58 |
+
if callback:
|
| 59 |
+
callback(progress)
|
| 60 |
+
page.pageno = pageno
|
| 61 |
+
pix = doc_en[page.pageno].get_pixmap()
|
| 62 |
+
image = np.fromstring(pix.samples, np.uint8).reshape(
|
| 63 |
+
pix.height, pix.width, 3
|
| 64 |
+
)[:, :, ::-1]
|
| 65 |
+
page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]
|
| 66 |
+
# kdtree 是不可能 kdtree 的,不如直接渲染成图片,用空间换时间
|
| 67 |
+
box = np.ones((pix.height, pix.width))
|
| 68 |
+
h, w = box.shape
|
| 69 |
+
vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
|
| 70 |
+
for i, d in enumerate(page_layout.boxes):
|
| 71 |
+
if not page_layout.names[int(d.cls)] in vcls:
|
| 72 |
+
x0, y0, x1, y1 = d.xyxy.squeeze()
|
| 73 |
+
x0, y0, x1, y1 = (
|
| 74 |
+
np.clip(int(x0 - 1), 0, w - 1),
|
| 75 |
+
np.clip(int(h - y1 - 1), 0, h - 1),
|
| 76 |
+
np.clip(int(x1 + 1), 0, w - 1),
|
| 77 |
+
np.clip(int(h - y0 + 1), 0, h - 1),
|
| 78 |
+
)
|
| 79 |
+
box[y0:y1, x0:x1] = i + 2
|
| 80 |
+
for i, d in enumerate(page_layout.boxes):
|
| 81 |
+
if page_layout.names[int(d.cls)] in vcls:
|
| 82 |
+
x0, y0, x1, y1 = d.xyxy.squeeze()
|
| 83 |
+
x0, y0, x1, y1 = (
|
| 84 |
+
np.clip(int(x0 - 1), 0, w - 1),
|
| 85 |
+
np.clip(int(h - y1 - 1), 0, h - 1),
|
| 86 |
+
np.clip(int(x1 + 1), 0, w - 1),
|
| 87 |
+
np.clip(int(h - y0 + 1), 0, h - 1),
|
| 88 |
+
)
|
| 89 |
+
box[y0:y1, x0:x1] = 0
|
| 90 |
+
layout[page.pageno] = box
|
| 91 |
+
# 新建一个 xref 存放新指令流
|
| 92 |
+
page.page_xref = doc_en.get_new_xref() # hack 插入页面的新 xref
|
| 93 |
+
doc_en.update_object(page.page_xref, "<<>>")
|
| 94 |
+
doc_en.update_stream(page.page_xref, b"")
|
| 95 |
+
doc_en[page.pageno].set_contents(page.page_xref)
|
| 96 |
+
interpreter.process_page(page)
|
| 97 |
+
|
| 98 |
+
device.close()
|
| 99 |
+
return obj_patch
|
pdf2zh/pdf2zh.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""A command line tool for extracting text and images from PDF and
|
| 3 |
+
output it to plain text, html, xml or tags.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import logging
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any, Container, Iterable, List, Optional
|
| 14 |
+
import urllib.request
|
| 15 |
+
from pdfminer.pdfexceptions import PDFValueError
|
| 16 |
+
|
| 17 |
+
import pymupdf
|
| 18 |
+
import requests
|
| 19 |
+
import tempfile
|
| 20 |
+
|
| 21 |
+
from pdf2zh import __version__, log
|
| 22 |
+
from pdf2zh.high_level import extract_text_to_fp
|
| 23 |
+
from pdf2zh.doclayout import DocLayoutModel
|
| 24 |
+
|
| 25 |
+
logging.basicConfig()
|
| 26 |
+
|
| 27 |
+
model = DocLayoutModel.load_available()
|
| 28 |
+
|
| 29 |
+
resfont_map = {
|
| 30 |
+
"zh-CN": "china-ss",
|
| 31 |
+
"zh-TW": "china-ts",
|
| 32 |
+
"ja": "japan-s",
|
| 33 |
+
"ko": "korea-s",
|
| 34 |
+
}
|
| 35 |
+
noto_list = [
|
| 36 |
+
"am", # Amharic
|
| 37 |
+
"ar", # Arabic
|
| 38 |
+
"bn", # Bengali
|
| 39 |
+
"bg", # Bulgarian
|
| 40 |
+
"chr", # Cherokee
|
| 41 |
+
"el", # Greek
|
| 42 |
+
"gu", # Gujarati
|
| 43 |
+
"iw", # Hebrew
|
| 44 |
+
"hi", # Hindi
|
| 45 |
+
# "ja", # Japanese
|
| 46 |
+
"kn", # Kannada
|
| 47 |
+
# "ko", # Korean
|
| 48 |
+
"ml", # Malayalam
|
| 49 |
+
"mr", # Marathi
|
| 50 |
+
"ru", # Russian
|
| 51 |
+
"sr", # Serbian
|
| 52 |
+
# "zh-CN",# Chinese (PRC)
|
| 53 |
+
"ta", # Tamil
|
| 54 |
+
"te", # Telugu
|
| 55 |
+
"th", # Thai
|
| 56 |
+
# "zh-TW",# Chinese (Taiwan)
|
| 57 |
+
"ur", # Urdu
|
| 58 |
+
"uk", # Ukrainian
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def check_files(files: List[str]) -> List[str]:
|
| 63 |
+
files = [
|
| 64 |
+
f for f in files if not f.startswith("http://")
|
| 65 |
+
] # exclude online files, http
|
| 66 |
+
files = [
|
| 67 |
+
f for f in files if not f.startswith("https://")
|
| 68 |
+
] # exclude online files, https
|
| 69 |
+
missing_files = [file for file in files if not os.path.exists(file)]
|
| 70 |
+
return missing_files
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def extract_text(
|
| 74 |
+
files: Iterable[str] = [],
|
| 75 |
+
pages: Optional[Container[int]] = None,
|
| 76 |
+
password: str = "",
|
| 77 |
+
debug: bool = False,
|
| 78 |
+
vfont: str = "",
|
| 79 |
+
vchar: str = "",
|
| 80 |
+
thread: int = 0,
|
| 81 |
+
lang_in: str = "",
|
| 82 |
+
lang_out: str = "",
|
| 83 |
+
service: str = "",
|
| 84 |
+
callback: object = None,
|
| 85 |
+
output: str = "",
|
| 86 |
+
**kwargs: Any,
|
| 87 |
+
):
|
| 88 |
+
if debug:
|
| 89 |
+
log.setLevel(logging.DEBUG)
|
| 90 |
+
|
| 91 |
+
if not files:
|
| 92 |
+
raise PDFValueError("Must provide files to work upon!")
|
| 93 |
+
|
| 94 |
+
for file in files:
|
| 95 |
+
if file is str and (file.startswith("http://") or file.startswith("https://")):
|
| 96 |
+
print("Online files detected, downloading...")
|
| 97 |
+
try:
|
| 98 |
+
r = requests.get(file, allow_redirects=True)
|
| 99 |
+
if r.status_code == 200:
|
| 100 |
+
if not os.path.exists("./pdf2zh_files"):
|
| 101 |
+
print("Making a temporary dir for downloading PDF files...")
|
| 102 |
+
os.mkdir(os.path.dirname("./pdf2zh_files"))
|
| 103 |
+
with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
|
| 104 |
+
print(f"Writing the file: {file}...")
|
| 105 |
+
f.write(r.content)
|
| 106 |
+
file = "./pdf2zh_files/tmp_download.pdf"
|
| 107 |
+
else:
|
| 108 |
+
r.raise_for_status()
|
| 109 |
+
except Exception as e:
|
| 110 |
+
raise PDFValueError(
|
| 111 |
+
f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
|
| 112 |
+
)
|
| 113 |
+
filename = os.path.splitext(os.path.basename(file))[0]
|
| 114 |
+
|
| 115 |
+
font_list = [("tiro", None)]
|
| 116 |
+
noto = None
|
| 117 |
+
if lang_out in resfont_map: # CJK
|
| 118 |
+
resfont = resfont_map[lang_out]
|
| 119 |
+
font_list.append((resfont, None))
|
| 120 |
+
elif lang_out in noto_list: # noto
|
| 121 |
+
resfont = "noto"
|
| 122 |
+
ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
|
| 123 |
+
if not os.path.exists(ttf_path):
|
| 124 |
+
print("Downloading Noto font...")
|
| 125 |
+
urllib.request.urlretrieve(
|
| 126 |
+
"https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
|
| 127 |
+
ttf_path,
|
| 128 |
+
)
|
| 129 |
+
font_list.append(("noto", ttf_path))
|
| 130 |
+
noto = pymupdf.Font("noto", ttf_path)
|
| 131 |
+
else: # auto
|
| 132 |
+
resfont = "china-ss"
|
| 133 |
+
font_list.append(("china-ss", None))
|
| 134 |
+
|
| 135 |
+
doc_en = pymupdf.open(file)
|
| 136 |
+
page_count = doc_en.page_count
|
| 137 |
+
# font_list = [("china-ss", None), ("tiro", None)]
|
| 138 |
+
font_id = {}
|
| 139 |
+
for page in doc_en:
|
| 140 |
+
for font in font_list:
|
| 141 |
+
font_id[font[0]] = page.insert_font(font[0], font[1])
|
| 142 |
+
xreflen = doc_en.xref_length()
|
| 143 |
+
for xref in range(1, xreflen):
|
| 144 |
+
for label in ["Resources/", ""]: # 可能是基于 xobj 的 res
|
| 145 |
+
try: # xref 读写可能出错
|
| 146 |
+
font_res = doc_en.xref_get_key(xref, f"{label}Font")
|
| 147 |
+
if font_res[0] == "dict":
|
| 148 |
+
for font in font_list:
|
| 149 |
+
font_exist = doc_en.xref_get_key(
|
| 150 |
+
xref, f"{label}Font/{font[0]}"
|
| 151 |
+
)
|
| 152 |
+
if font_exist[0] == "null":
|
| 153 |
+
doc_en.xref_set_key(
|
| 154 |
+
xref,
|
| 155 |
+
f"{label}Font/{font[0]}",
|
| 156 |
+
f"{font_id[font[0]]} 0 R",
|
| 157 |
+
)
|
| 158 |
+
except Exception:
|
| 159 |
+
pass
|
| 160 |
+
doc_en.save(Path(output) / f"{filename}-en.pdf")
|
| 161 |
+
|
| 162 |
+
with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
|
| 163 |
+
obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())
|
| 164 |
+
|
| 165 |
+
for obj_id, ops_new in obj_patch.items():
|
| 166 |
+
# ops_old=doc_en.xref_stream(obj_id)
|
| 167 |
+
# print(obj_id)
|
| 168 |
+
# print(ops_old)
|
| 169 |
+
# print(ops_new.encode())
|
| 170 |
+
doc_en.update_stream(obj_id, ops_new.encode())
|
| 171 |
+
|
| 172 |
+
doc_zh = doc_en
|
| 173 |
+
doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
|
| 174 |
+
doc_dual.insert_file(doc_zh)
|
| 175 |
+
for id in range(page_count):
|
| 176 |
+
doc_dual.move_page(page_count + id, id * 2 + 1)
|
| 177 |
+
doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
|
| 178 |
+
doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
|
| 179 |
+
doc_zh.close()
|
| 180 |
+
doc_dual.close()
|
| 181 |
+
os.remove(Path(output) / f"{filename}-en.pdf")
|
| 182 |
+
|
| 183 |
+
return
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def create_parser() -> argparse.ArgumentParser:
|
| 187 |
+
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
| 188 |
+
parser.add_argument(
|
| 189 |
+
"files",
|
| 190 |
+
type=str,
|
| 191 |
+
default=None,
|
| 192 |
+
nargs="*",
|
| 193 |
+
help="One or more paths to PDF files.",
|
| 194 |
+
)
|
| 195 |
+
parser.add_argument(
|
| 196 |
+
"--version",
|
| 197 |
+
"-v",
|
| 198 |
+
action="version",
|
| 199 |
+
version=f"pdf2zh v{__version__}",
|
| 200 |
+
)
|
| 201 |
+
parser.add_argument(
|
| 202 |
+
"--debug",
|
| 203 |
+
"-d",
|
| 204 |
+
default=False,
|
| 205 |
+
action="store_true",
|
| 206 |
+
help="Use debug logging level.",
|
| 207 |
+
)
|
| 208 |
+
parse_params = parser.add_argument_group(
|
| 209 |
+
"Parser",
|
| 210 |
+
description="Used during PDF parsing",
|
| 211 |
+
)
|
| 212 |
+
parse_params.add_argument(
|
| 213 |
+
"--pages",
|
| 214 |
+
"-p",
|
| 215 |
+
type=str,
|
| 216 |
+
help="The list of page numbers to parse.",
|
| 217 |
+
)
|
| 218 |
+
parse_params.add_argument(
|
| 219 |
+
"--password",
|
| 220 |
+
"-P",
|
| 221 |
+
type=str,
|
| 222 |
+
default="",
|
| 223 |
+
help="The password to use for decrypting PDF file.",
|
| 224 |
+
)
|
| 225 |
+
parse_params.add_argument(
|
| 226 |
+
"--vfont",
|
| 227 |
+
"-f",
|
| 228 |
+
type=str,
|
| 229 |
+
default="",
|
| 230 |
+
help="The regex to math font name of formula.",
|
| 231 |
+
)
|
| 232 |
+
parse_params.add_argument(
|
| 233 |
+
"--vchar",
|
| 234 |
+
"-c",
|
| 235 |
+
type=str,
|
| 236 |
+
default="",
|
| 237 |
+
help="The regex to math character of formula.",
|
| 238 |
+
)
|
| 239 |
+
parse_params.add_argument(
|
| 240 |
+
"--lang-in",
|
| 241 |
+
"-li",
|
| 242 |
+
type=str,
|
| 243 |
+
default="auto",
|
| 244 |
+
help="The code of source language.",
|
| 245 |
+
)
|
| 246 |
+
parse_params.add_argument(
|
| 247 |
+
"--lang-out",
|
| 248 |
+
"-lo",
|
| 249 |
+
type=str,
|
| 250 |
+
default="auto",
|
| 251 |
+
help="The code of target language.",
|
| 252 |
+
)
|
| 253 |
+
parse_params.add_argument(
|
| 254 |
+
"--service",
|
| 255 |
+
"-s",
|
| 256 |
+
type=str,
|
| 257 |
+
default="google",
|
| 258 |
+
help="The service to use for translation.",
|
| 259 |
+
)
|
| 260 |
+
parse_params.add_argument(
|
| 261 |
+
"--output",
|
| 262 |
+
"-o",
|
| 263 |
+
type=str,
|
| 264 |
+
default="",
|
| 265 |
+
help="Output directory for files.",
|
| 266 |
+
)
|
| 267 |
+
parse_params.add_argument(
|
| 268 |
+
"--thread",
|
| 269 |
+
"-t",
|
| 270 |
+
type=int,
|
| 271 |
+
default=4,
|
| 272 |
+
help="The number of threads to execute translation.",
|
| 273 |
+
)
|
| 274 |
+
parse_params.add_argument(
|
| 275 |
+
"--interactive",
|
| 276 |
+
"-i",
|
| 277 |
+
action="store_true",
|
| 278 |
+
help="Interact with GUI.",
|
| 279 |
+
)
|
| 280 |
+
parse_params.add_argument(
|
| 281 |
+
"--share",
|
| 282 |
+
action="store_true",
|
| 283 |
+
help="Enable Gradio Share",
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
return parser
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
| 290 |
+
parsed_args = create_parser().parse_args(args=args)
|
| 291 |
+
|
| 292 |
+
if parsed_args.pages:
|
| 293 |
+
pages = []
|
| 294 |
+
for p in parsed_args.pages.split(","):
|
| 295 |
+
if "-" in p:
|
| 296 |
+
start, end = p.split("-")
|
| 297 |
+
pages.extend(range(int(start) - 1, int(end)))
|
| 298 |
+
else:
|
| 299 |
+
pages.append(int(p) - 1)
|
| 300 |
+
parsed_args.pages = pages
|
| 301 |
+
|
| 302 |
+
return parsed_args
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def main(args: Optional[List[str]] = None) -> int:
|
| 306 |
+
parsed_args = parse_args(args)
|
| 307 |
+
|
| 308 |
+
missing_files = check_files(parsed_args.files)
|
| 309 |
+
if missing_files:
|
| 310 |
+
print("The following files do not exist:", file=sys.stderr)
|
| 311 |
+
for file in missing_files:
|
| 312 |
+
print(f" {file}", file=sys.stderr)
|
| 313 |
+
return -1
|
| 314 |
+
if parsed_args.interactive:
|
| 315 |
+
from pdf2zh.gui import setup_gui
|
| 316 |
+
|
| 317 |
+
setup_gui(parsed_args.share)
|
| 318 |
+
return 0
|
| 319 |
+
|
| 320 |
+
extract_text(**vars(parsed_args))
|
| 321 |
+
return 0
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
if __name__ == "__main__":
|
| 325 |
+
sys.exit(main())
|
pdf2zh/pdfinterp.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Any, Dict, Optional, Sequence, Tuple, cast
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from pdfminer import settings
|
| 6 |
+
from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
|
| 7 |
+
from pdfminer.pdfdevice import PDFDevice
|
| 8 |
+
from pdfminer.pdfinterp import (
|
| 9 |
+
PDFPageInterpreter,
|
| 10 |
+
PDFResourceManager,
|
| 11 |
+
PDFContentParser,
|
| 12 |
+
PDFInterpreterError,
|
| 13 |
+
Color,
|
| 14 |
+
PDFStackT,
|
| 15 |
+
LITERAL_FORM,
|
| 16 |
+
LITERAL_IMAGE,
|
| 17 |
+
)
|
| 18 |
+
from pdfminer.pdffont import PDFFont
|
| 19 |
+
from pdfminer.pdfpage import PDFPage
|
| 20 |
+
from pdfminer.pdftypes import (
|
| 21 |
+
PDFObjRef,
|
| 22 |
+
dict_value,
|
| 23 |
+
list_value,
|
| 24 |
+
resolve1,
|
| 25 |
+
stream_value,
|
| 26 |
+
)
|
| 27 |
+
from pdfminer.psexceptions import PSEOF
|
| 28 |
+
from pdfminer.psparser import (
|
| 29 |
+
PSKeyword,
|
| 30 |
+
keyword_name,
|
| 31 |
+
literal_name,
|
| 32 |
+
)
|
| 33 |
+
from pdfminer.utils import (
|
| 34 |
+
MATRIX_IDENTITY,
|
| 35 |
+
Matrix,
|
| 36 |
+
Rect,
|
| 37 |
+
mult_matrix,
|
| 38 |
+
apply_matrix_pt,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
log = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def safe_float(o: Any) -> Optional[float]:
|
| 45 |
+
try:
|
| 46 |
+
return float(o)
|
| 47 |
+
except (TypeError, ValueError):
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class PDFPageInterpreterEx(PDFPageInterpreter):
|
| 52 |
+
"""Processor for the content of a PDF page
|
| 53 |
+
|
| 54 |
+
Reference: PDF Reference, Appendix A, Operator Summary
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
def __init__(
|
| 58 |
+
self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
|
| 59 |
+
) -> None:
|
| 60 |
+
self.rsrcmgr = rsrcmgr
|
| 61 |
+
self.device = device
|
| 62 |
+
self.obj_patch = obj_patch
|
| 63 |
+
|
| 64 |
+
def dup(self) -> "PDFPageInterpreterEx":
|
| 65 |
+
return self.__class__(self.rsrcmgr, self.device, self.obj_patch)
|
| 66 |
+
|
| 67 |
+
def init_resources(self, resources: Dict[object, object]) -> None:
|
| 68 |
+
# 重载设置 fontid 和 descent
|
| 69 |
+
"""Prepare the fonts and XObjects listed in the Resource attribute."""
|
| 70 |
+
self.resources = resources
|
| 71 |
+
self.fontmap: Dict[object, PDFFont] = {}
|
| 72 |
+
self.fontid: Dict[PDFFont, object] = {}
|
| 73 |
+
self.xobjmap = {}
|
| 74 |
+
self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
|
| 75 |
+
if not resources:
|
| 76 |
+
return
|
| 77 |
+
|
| 78 |
+
def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
|
| 79 |
+
if isinstance(spec, list):
|
| 80 |
+
name = literal_name(spec[0])
|
| 81 |
+
else:
|
| 82 |
+
name = literal_name(spec)
|
| 83 |
+
if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
|
| 84 |
+
return PDFColorSpace(name, stream_value(spec[1])["N"])
|
| 85 |
+
elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
|
| 86 |
+
return PDFColorSpace(name, len(list_value(spec[1])))
|
| 87 |
+
else:
|
| 88 |
+
return PREDEFINED_COLORSPACE.get(name)
|
| 89 |
+
|
| 90 |
+
for k, v in dict_value(resources).items():
|
| 91 |
+
# log.debug("Resource: %r: %r", k, v)
|
| 92 |
+
if k == "Font":
|
| 93 |
+
for fontid, spec in dict_value(v).items():
|
| 94 |
+
objid = None
|
| 95 |
+
if isinstance(spec, PDFObjRef):
|
| 96 |
+
objid = spec.objid
|
| 97 |
+
spec = dict_value(spec)
|
| 98 |
+
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
| 99 |
+
self.fontmap[fontid].descent = 0 # hack fix descent
|
| 100 |
+
self.fontid[self.fontmap[fontid]] = fontid
|
| 101 |
+
elif k == "ColorSpace":
|
| 102 |
+
for csid, spec in dict_value(v).items():
|
| 103 |
+
colorspace = get_colorspace(resolve1(spec))
|
| 104 |
+
if colorspace is not None:
|
| 105 |
+
self.csmap[csid] = colorspace
|
| 106 |
+
elif k == "ProcSet":
|
| 107 |
+
self.rsrcmgr.get_procset(list_value(v))
|
| 108 |
+
elif k == "XObject":
|
| 109 |
+
for xobjid, xobjstrm in dict_value(v).items():
|
| 110 |
+
self.xobjmap[xobjid] = xobjstrm
|
| 111 |
+
|
| 112 |
+
def do_S(self) -> None:
|
| 113 |
+
# 重载过滤非公式线条
|
| 114 |
+
"""Stroke path"""
|
| 115 |
+
|
| 116 |
+
def is_black(color: Color) -> bool:
|
| 117 |
+
if isinstance(color, Tuple):
|
| 118 |
+
return sum(color) == 0
|
| 119 |
+
else:
|
| 120 |
+
return color == 0
|
| 121 |
+
|
| 122 |
+
if (
|
| 123 |
+
len(self.curpath) == 2
|
| 124 |
+
and self.curpath[0][0] == "m"
|
| 125 |
+
and self.curpath[1][0] == "l"
|
| 126 |
+
and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
|
| 127 |
+
== apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
|
| 128 |
+
and is_black(self.graphicstate.scolor)
|
| 129 |
+
): # 独立直线,水平,黑色
|
| 130 |
+
# print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
|
| 131 |
+
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
| 132 |
+
self.curpath = []
|
| 133 |
+
return "n"
|
| 134 |
+
else:
|
| 135 |
+
self.curpath = []
|
| 136 |
+
|
| 137 |
+
############################################################
|
| 138 |
+
# 重载过滤非公式线条(F/B)
|
| 139 |
+
def do_f(self) -> None:
|
| 140 |
+
"""Fill path using nonzero winding number rule"""
|
| 141 |
+
# self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
| 142 |
+
self.curpath = []
|
| 143 |
+
|
| 144 |
+
def do_F(self) -> None:
|
| 145 |
+
"""Fill path using nonzero winding number rule (obsolete)"""
|
| 146 |
+
|
| 147 |
+
def do_f_a(self) -> None:
|
| 148 |
+
"""Fill path using even-odd rule"""
|
| 149 |
+
# self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
| 150 |
+
self.curpath = []
|
| 151 |
+
|
| 152 |
+
def do_B(self) -> None:
|
| 153 |
+
"""Fill and stroke path using nonzero winding number rule"""
|
| 154 |
+
# self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
| 155 |
+
self.curpath = []
|
| 156 |
+
|
| 157 |
+
def do_B_a(self) -> None:
|
| 158 |
+
"""Fill and stroke path using even-odd rule"""
|
| 159 |
+
# self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
| 160 |
+
self.curpath = []
|
| 161 |
+
|
| 162 |
+
############################################################
|
| 163 |
+
# 重载返回调用参数(SCN)
|
| 164 |
+
def do_SCN(self) -> None:
|
| 165 |
+
"""Set color for stroking operations."""
|
| 166 |
+
if self.scs:
|
| 167 |
+
n = self.scs.ncomponents
|
| 168 |
+
else:
|
| 169 |
+
if settings.STRICT:
|
| 170 |
+
raise PDFInterpreterError("No colorspace specified!")
|
| 171 |
+
n = 1
|
| 172 |
+
args = self.pop(n)
|
| 173 |
+
self.graphicstate.scolor = cast(Color, args)
|
| 174 |
+
return args
|
| 175 |
+
|
| 176 |
+
def do_scn(self) -> None:
|
| 177 |
+
"""Set color for nonstroking operations"""
|
| 178 |
+
if self.ncs:
|
| 179 |
+
n = self.ncs.ncomponents
|
| 180 |
+
else:
|
| 181 |
+
if settings.STRICT:
|
| 182 |
+
raise PDFInterpreterError("No colorspace specified!")
|
| 183 |
+
n = 1
|
| 184 |
+
args = self.pop(n)
|
| 185 |
+
self.graphicstate.ncolor = cast(Color, args)
|
| 186 |
+
return args
|
| 187 |
+
|
| 188 |
+
def do_SC(self) -> None:
|
| 189 |
+
"""Set color for stroking operations"""
|
| 190 |
+
return self.do_SCN()
|
| 191 |
+
|
| 192 |
+
def do_sc(self) -> None:
|
| 193 |
+
"""Set color for nonstroking operations"""
|
| 194 |
+
return self.do_scn()
|
| 195 |
+
|
| 196 |
+
def do_Do(self, xobjid_arg: PDFStackT) -> None:
|
| 197 |
+
# 重载设置 xobj 的 obj_patch
|
| 198 |
+
"""Invoke named XObject"""
|
| 199 |
+
xobjid = literal_name(xobjid_arg)
|
| 200 |
+
try:
|
| 201 |
+
xobj = stream_value(self.xobjmap[xobjid])
|
| 202 |
+
except KeyError:
|
| 203 |
+
if settings.STRICT:
|
| 204 |
+
raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
|
| 205 |
+
return
|
| 206 |
+
# log.debug("Processing xobj: %r", xobj)
|
| 207 |
+
subtype = xobj.get("Subtype")
|
| 208 |
+
if subtype is LITERAL_FORM and "BBox" in xobj:
|
| 209 |
+
interpreter = self.dup()
|
| 210 |
+
bbox = cast(Rect, list_value(xobj["BBox"]))
|
| 211 |
+
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
|
| 212 |
+
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
| 213 |
+
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
| 214 |
+
# instead of having their own Resources entry.
|
| 215 |
+
xobjres = xobj.get("Resources")
|
| 216 |
+
if xobjres:
|
| 217 |
+
resources = dict_value(xobjres)
|
| 218 |
+
else:
|
| 219 |
+
resources = self.resources.copy()
|
| 220 |
+
self.device.begin_figure(xobjid, bbox, matrix)
|
| 221 |
+
ctm = mult_matrix(matrix, self.ctm)
|
| 222 |
+
ops_base = interpreter.render_contents(
|
| 223 |
+
resources,
|
| 224 |
+
[xobj],
|
| 225 |
+
ctm=ctm,
|
| 226 |
+
)
|
| 227 |
+
try: # 有的时候 form 字体加不上这里会烂掉
|
| 228 |
+
self.device.fontid = interpreter.fontid
|
| 229 |
+
self.device.fontmap = interpreter.fontmap
|
| 230 |
+
ops_new = self.device.end_figure(xobjid)
|
| 231 |
+
ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
|
| 232 |
+
pos_inv = -np.mat(ctm[4:]) * ctm_inv
|
| 233 |
+
a, b, c, d = ctm_inv.reshape(4).tolist()
|
| 234 |
+
e, f = pos_inv.tolist()[0]
|
| 235 |
+
self.obj_patch[self.xobjmap[xobjid].objid] = (
|
| 236 |
+
f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
|
| 237 |
+
)
|
| 238 |
+
except Exception:
|
| 239 |
+
pass
|
| 240 |
+
elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
|
| 241 |
+
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
| 242 |
+
self.device.render_image(xobjid, xobj)
|
| 243 |
+
self.device.end_figure(xobjid)
|
| 244 |
+
else:
|
| 245 |
+
# unsupported xobject type.
|
| 246 |
+
pass
|
| 247 |
+
|
| 248 |
+
def process_page(self, page: PDFPage) -> None:
|
| 249 |
+
# 重载设置 page 的 obj_patch
|
| 250 |
+
# log.debug("Processing page: %r", page)
|
| 251 |
+
# print(page.mediabox,page.cropbox)
|
| 252 |
+
# (x0, y0, x1, y1) = page.mediabox
|
| 253 |
+
(x0, y0, x1, y1) = page.cropbox
|
| 254 |
+
if page.rotate == 90:
|
| 255 |
+
ctm = (0, -1, 1, 0, -y0, x1)
|
| 256 |
+
elif page.rotate == 180:
|
| 257 |
+
ctm = (-1, 0, 0, -1, x1, y1)
|
| 258 |
+
elif page.rotate == 270:
|
| 259 |
+
ctm = (0, 1, -1, 0, y1, -x0)
|
| 260 |
+
else:
|
| 261 |
+
ctm = (1, 0, 0, 1, -x0, -y0)
|
| 262 |
+
self.device.begin_page(page, ctm)
|
| 263 |
+
ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
|
| 264 |
+
self.device.fontid = self.fontid
|
| 265 |
+
self.device.fontmap = self.fontmap
|
| 266 |
+
ops_new = self.device.end_page(page)
|
| 267 |
+
# 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来
|
| 268 |
+
self.obj_patch[page.page_xref] = (
|
| 269 |
+
f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
|
| 270 |
+
)
|
| 271 |
+
for obj in page.contents:
|
| 272 |
+
self.obj_patch[obj.objid] = ""
|
| 273 |
+
|
| 274 |
+
def render_contents(
|
| 275 |
+
self,
|
| 276 |
+
resources: Dict[object, object],
|
| 277 |
+
streams: Sequence[object],
|
| 278 |
+
ctm: Matrix = MATRIX_IDENTITY,
|
| 279 |
+
) -> None:
|
| 280 |
+
# 重载返回指令流
|
| 281 |
+
"""Render the content streams.
|
| 282 |
+
|
| 283 |
+
This method may be called recursively.
|
| 284 |
+
"""
|
| 285 |
+
# log.debug(
|
| 286 |
+
# "render_contents: resources=%r, streams=%r, ctm=%r",
|
| 287 |
+
# resources,
|
| 288 |
+
# streams,
|
| 289 |
+
# ctm,
|
| 290 |
+
# )
|
| 291 |
+
self.init_resources(resources)
|
| 292 |
+
self.init_state(ctm)
|
| 293 |
+
return self.execute(list_value(streams))
|
| 294 |
+
|
| 295 |
+
def execute(self, streams: Sequence[object]) -> None:
|
| 296 |
+
# 重载返回指令流
|
| 297 |
+
ops = ""
|
| 298 |
+
try:
|
| 299 |
+
parser = PDFContentParser(streams)
|
| 300 |
+
except PSEOF:
|
| 301 |
+
# empty page
|
| 302 |
+
return
|
| 303 |
+
while True:
|
| 304 |
+
try:
|
| 305 |
+
(_, obj) = parser.nextobject()
|
| 306 |
+
except PSEOF:
|
| 307 |
+
break
|
| 308 |
+
if isinstance(obj, PSKeyword):
|
| 309 |
+
name = keyword_name(obj)
|
| 310 |
+
method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
|
| 311 |
+
"'",
|
| 312 |
+
"_q",
|
| 313 |
+
)
|
| 314 |
+
if hasattr(self, method):
|
| 315 |
+
func = getattr(self, method)
|
| 316 |
+
nargs = func.__code__.co_argcount - 1
|
| 317 |
+
if nargs:
|
| 318 |
+
args = self.pop(nargs)
|
| 319 |
+
# log.debug("exec: %s %r", name, args)
|
| 320 |
+
if len(args) == nargs:
|
| 321 |
+
func(*args)
|
| 322 |
+
if not (
|
| 323 |
+
name[0] == "T"
|
| 324 |
+
or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
|
| 325 |
+
): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令
|
| 326 |
+
p = " ".join(
|
| 327 |
+
[
|
| 328 |
+
(
|
| 329 |
+
f"{x:f}"
|
| 330 |
+
if isinstance(x, float)
|
| 331 |
+
else str(x).replace("'", "")
|
| 332 |
+
)
|
| 333 |
+
for x in args
|
| 334 |
+
]
|
| 335 |
+
)
|
| 336 |
+
ops += f"{p} {name} "
|
| 337 |
+
else:
|
| 338 |
+
# log.debug("exec: %s", name)
|
| 339 |
+
targs = func()
|
| 340 |
+
if targs is None:
|
| 341 |
+
targs = []
|
| 342 |
+
if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
|
| 343 |
+
p = " ".join(
|
| 344 |
+
[
|
| 345 |
+
(
|
| 346 |
+
f"{x:f}"
|
| 347 |
+
if isinstance(x, float)
|
| 348 |
+
else str(x).replace("'", "")
|
| 349 |
+
)
|
| 350 |
+
for x in targs
|
| 351 |
+
]
|
| 352 |
+
)
|
| 353 |
+
ops += f"{p} {name} "
|
| 354 |
+
elif settings.STRICT:
|
| 355 |
+
error_msg = "Unknown operator: %r" % name
|
| 356 |
+
raise PDFInterpreterError(error_msg)
|
| 357 |
+
else:
|
| 358 |
+
self.push(obj)
|
| 359 |
+
# print('REV DATA',ops)
|
| 360 |
+
return ops
|
pdf2zh/translator.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import hmac
|
| 3 |
+
import html
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import time
|
| 8 |
+
from datetime import timezone, datetime
|
| 9 |
+
|
| 10 |
+
from json import dumps, loads
|
| 11 |
+
import unicodedata
|
| 12 |
+
|
| 13 |
+
import deepl
|
| 14 |
+
import ollama
|
| 15 |
+
import openai
|
| 16 |
+
import requests
|
| 17 |
+
from azure.ai.translation.text import TextTranslationClient
|
| 18 |
+
from azure.core.credentials import AzureKeyCredential
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def remove_control_characters(s):
|
| 22 |
+
return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class BaseTranslator:
|
| 26 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 27 |
+
self.service = service
|
| 28 |
+
self.lang_out = lang_out
|
| 29 |
+
self.lang_in = lang_in
|
| 30 |
+
self.model = model
|
| 31 |
+
|
| 32 |
+
def translate(self, text) -> str: ... # noqa: E704
|
| 33 |
+
|
| 34 |
+
def __str__(self):
|
| 35 |
+
return f"{self.service} {self.lang_out} {self.lang_in}"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class GoogleTranslator(BaseTranslator):
|
| 39 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 40 |
+
lang_out = "zh-CN" if lang_out == "auto" else lang_out
|
| 41 |
+
lang_in = "en" if lang_in == "auto" else lang_in
|
| 42 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 43 |
+
self.session = requests.Session()
|
| 44 |
+
self.base_link = "http://translate.google.com/m"
|
| 45 |
+
self.headers = {
|
| 46 |
+
"User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
def translate(self, text):
|
| 50 |
+
text = text[:5000] # google translate max length
|
| 51 |
+
response = self.session.get(
|
| 52 |
+
self.base_link,
|
| 53 |
+
params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
|
| 54 |
+
headers=self.headers,
|
| 55 |
+
)
|
| 56 |
+
re_result = re.findall(
|
| 57 |
+
r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
|
| 58 |
+
)
|
| 59 |
+
if response.status_code == 400:
|
| 60 |
+
result = "IRREPARABLE TRANSLATION ERROR"
|
| 61 |
+
elif len(re_result) == 0:
|
| 62 |
+
raise ValueError("Empty translation result")
|
| 63 |
+
else:
|
| 64 |
+
result = html.unescape(re_result[0])
|
| 65 |
+
return remove_control_characters(result)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class TencentTranslator(BaseTranslator):
|
| 69 |
+
def sign(self, key, msg):
|
| 70 |
+
return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()
|
| 71 |
+
|
| 72 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 73 |
+
lang_out = "zh" if lang_out == "auto" else lang_out
|
| 74 |
+
lang_in = "en" if lang_in == "auto" else lang_in
|
| 75 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 76 |
+
try:
|
| 77 |
+
server_url = "tmt.tencentcloudapi.com"
|
| 78 |
+
self.secret_id = os.getenv("TENCENT_SECRET_ID")
|
| 79 |
+
self.secret_key = os.getenv("TENCENT_SECRET_KEY")
|
| 80 |
+
|
| 81 |
+
except KeyError as e:
|
| 82 |
+
missing_var = e.args[0]
|
| 83 |
+
raise ValueError(
|
| 84 |
+
f"The environment variable '{missing_var}' is required but not set."
|
| 85 |
+
) from e
|
| 86 |
+
|
| 87 |
+
self.session = requests.Session()
|
| 88 |
+
self.base_link = f"{server_url}"
|
| 89 |
+
|
| 90 |
+
def translate(self, text):
|
| 91 |
+
text = text[:5000]
|
| 92 |
+
data = {
|
| 93 |
+
"SourceText": text,
|
| 94 |
+
"Source": self.lang_in,
|
| 95 |
+
"Target": self.lang_out,
|
| 96 |
+
"ProjectId": 0,
|
| 97 |
+
}
|
| 98 |
+
payloadx = dumps(data)
|
| 99 |
+
hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
|
| 100 |
+
canonical_request = (
|
| 101 |
+
"POST"
|
| 102 |
+
+ "\n"
|
| 103 |
+
+ "/"
|
| 104 |
+
+ "\n"
|
| 105 |
+
+ ""
|
| 106 |
+
+ "\n"
|
| 107 |
+
+ "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n"
|
| 108 |
+
+ "\n"
|
| 109 |
+
+ "content-type;host;x-tc-action"
|
| 110 |
+
+ "\n"
|
| 111 |
+
+ hashed_request_payload
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
timestamp = int(time.time())
|
| 115 |
+
date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
|
| 116 |
+
credential_scope = date + "/tmt/tc3_request"
|
| 117 |
+
hashed_canonical_request = hashlib.sha256(
|
| 118 |
+
canonical_request.encode("utf-8")
|
| 119 |
+
).hexdigest()
|
| 120 |
+
algorithm = "TC3-HMAC-SHA256"
|
| 121 |
+
string_to_sign = (
|
| 122 |
+
algorithm
|
| 123 |
+
+ "\n"
|
| 124 |
+
+ str(timestamp)
|
| 125 |
+
+ "\n"
|
| 126 |
+
+ credential_scope
|
| 127 |
+
+ "\n"
|
| 128 |
+
+ hashed_canonical_request
|
| 129 |
+
)
|
| 130 |
+
secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
|
| 131 |
+
secret_service = self.sign(secret_date, "tmt")
|
| 132 |
+
secret_signing = self.sign(secret_service, "tc3_request")
|
| 133 |
+
signed_headers = "content-type;host;x-tc-action"
|
| 134 |
+
signature = hmac.new(
|
| 135 |
+
secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
|
| 136 |
+
).hexdigest()
|
| 137 |
+
authorization = (
|
| 138 |
+
algorithm
|
| 139 |
+
+ " "
|
| 140 |
+
+ "Credential="
|
| 141 |
+
+ str(self.secret_id)
|
| 142 |
+
+ "/"
|
| 143 |
+
+ credential_scope
|
| 144 |
+
+ ", "
|
| 145 |
+
+ "SignedHeaders="
|
| 146 |
+
+ signed_headers
|
| 147 |
+
+ ", "
|
| 148 |
+
+ "Signature="
|
| 149 |
+
+ signature
|
| 150 |
+
)
|
| 151 |
+
self.headers = {
|
| 152 |
+
"Authorization": authorization,
|
| 153 |
+
"Content-Type": "application/json; charset=utf-8",
|
| 154 |
+
"Host": "tmt.tencentcloudapi.com",
|
| 155 |
+
"X-TC-Action": "TextTranslate",
|
| 156 |
+
"X-TC-Region": "ap-beijing",
|
| 157 |
+
"X-TC-Timestamp": str(timestamp),
|
| 158 |
+
"X-TC-Version": "2018-03-21",
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
response = self.session.post(
|
| 162 |
+
"https://" + self.base_link,
|
| 163 |
+
json=data,
|
| 164 |
+
headers=self.headers,
|
| 165 |
+
)
|
| 166 |
+
# 1. Status code test
|
| 167 |
+
if response.status_code == 200:
|
| 168 |
+
result = loads(response.text)
|
| 169 |
+
else:
|
| 170 |
+
raise ValueError("HTTP error: " + str(response.status_code))
|
| 171 |
+
# 2. Result test
|
| 172 |
+
try:
|
| 173 |
+
result = result["Response"]["TargetText"]
|
| 174 |
+
# return result
|
| 175 |
+
except KeyError:
|
| 176 |
+
result = ""
|
| 177 |
+
# raise ValueError("No valid key in Tencent's response")
|
| 178 |
+
# # 3. Result length check
|
| 179 |
+
# if len(result) == 0:
|
| 180 |
+
# raise ValueError("Empty translation result")
|
| 181 |
+
return result
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
class DeepLXTranslator(BaseTranslator):
|
| 185 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 186 |
+
lang_out = "zh" if lang_out == "auto" else lang_out
|
| 187 |
+
lang_in = "en" if lang_in == "auto" else lang_in
|
| 188 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 189 |
+
try:
|
| 190 |
+
auth_key = os.getenv("DEEPLX_AUTH_KEY")
|
| 191 |
+
server_url = (
|
| 192 |
+
"https://api.deeplx.org"
|
| 193 |
+
if not os.getenv("DEEPLX_SERVER_URL")
|
| 194 |
+
else os.getenv("DEEPLX_SERVER_URL")
|
| 195 |
+
)
|
| 196 |
+
except KeyError as e:
|
| 197 |
+
missing_var = e.args[0]
|
| 198 |
+
raise ValueError(
|
| 199 |
+
f"The environment variable '{missing_var}' is required but not set."
|
| 200 |
+
) from e
|
| 201 |
+
|
| 202 |
+
self.session = requests.Session()
|
| 203 |
+
server_url = str(server_url).rstrip("/")
|
| 204 |
+
if auth_key:
|
| 205 |
+
self.base_link = f"{server_url}/{auth_key}/translate"
|
| 206 |
+
else:
|
| 207 |
+
self.base_link = f"{server_url}/translate"
|
| 208 |
+
self.headers = {
|
| 209 |
+
"User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
def translate(self, text):
|
| 213 |
+
text = text[:5000] # google translate max length
|
| 214 |
+
response = self.session.post(
|
| 215 |
+
self.base_link,
|
| 216 |
+
dumps(
|
| 217 |
+
{
|
| 218 |
+
"target_lang": self.lang_out,
|
| 219 |
+
"text": text,
|
| 220 |
+
}
|
| 221 |
+
),
|
| 222 |
+
headers=self.headers,
|
| 223 |
+
)
|
| 224 |
+
# 1. Status code test
|
| 225 |
+
if response.status_code == 200:
|
| 226 |
+
result = loads(response.text)
|
| 227 |
+
else:
|
| 228 |
+
raise ValueError("HTTP error: " + str(response.status_code))
|
| 229 |
+
# 2. Result test
|
| 230 |
+
try:
|
| 231 |
+
result = result["data"]
|
| 232 |
+
return result
|
| 233 |
+
except KeyError:
|
| 234 |
+
result = ""
|
| 235 |
+
raise ValueError("No valid key in DeepLX's response")
|
| 236 |
+
# 3. Result length check
|
| 237 |
+
if len(result) == 0:
|
| 238 |
+
raise ValueError("Empty translation result")
|
| 239 |
+
return result
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
class DeepLTranslator(BaseTranslator):
|
| 243 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 244 |
+
lang_out = "ZH" if lang_out == "auto" else lang_out
|
| 245 |
+
lang_in = "EN" if lang_in == "auto" else lang_in
|
| 246 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 247 |
+
self.session = requests.Session()
|
| 248 |
+
auth_key = os.getenv("DEEPL_AUTH_KEY")
|
| 249 |
+
server_url = os.getenv("DEEPL_SERVER_URL")
|
| 250 |
+
self.client = deepl.Translator(auth_key, server_url=server_url)
|
| 251 |
+
|
| 252 |
+
def translate(self, text):
|
| 253 |
+
response = self.client.translate_text(
|
| 254 |
+
text, target_lang=self.lang_out, source_lang=self.lang_in
|
| 255 |
+
)
|
| 256 |
+
return response.text
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class OllamaTranslator(BaseTranslator):
|
| 260 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 261 |
+
lang_out = "zh-CN" if lang_out == "auto" else lang_out
|
| 262 |
+
lang_in = "en" if lang_in == "auto" else lang_in
|
| 263 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 264 |
+
self.options = {"temperature": 0} # 随机采样可能会打断公式标记
|
| 265 |
+
# OLLAMA_HOST
|
| 266 |
+
self.client = ollama.Client()
|
| 267 |
+
|
| 268 |
+
def translate(self, text):
|
| 269 |
+
response = self.client.chat(
|
| 270 |
+
model=self.model,
|
| 271 |
+
options=self.options,
|
| 272 |
+
messages=[
|
| 273 |
+
{
|
| 274 |
+
"role": "system",
|
| 275 |
+
"content": "You are a professional,authentic machine translation engine.",
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"role": "user",
|
| 279 |
+
"content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
|
| 280 |
+
},
|
| 281 |
+
],
|
| 282 |
+
)
|
| 283 |
+
return response["message"]["content"].strip()
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
class OpenAITranslator(BaseTranslator):
|
| 287 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 288 |
+
lang_out = "zh-CN" if lang_out == "auto" else lang_out
|
| 289 |
+
lang_in = "en" if lang_in == "auto" else lang_in
|
| 290 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 291 |
+
self.options = {"temperature": 0} # 随机采样可能会打断公式标记
|
| 292 |
+
# OPENAI_BASE_URL
|
| 293 |
+
# OPENAI_API_KEY
|
| 294 |
+
self.client = openai.OpenAI()
|
| 295 |
+
|
| 296 |
+
def translate(self, text) -> str:
|
| 297 |
+
response = self.client.chat.completions.create(
|
| 298 |
+
model=self.model,
|
| 299 |
+
**self.options,
|
| 300 |
+
messages=[
|
| 301 |
+
{
|
| 302 |
+
"role": "system",
|
| 303 |
+
"content": "You are a professional,authentic machine translation engine.",
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"role": "user",
|
| 307 |
+
"content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
|
| 308 |
+
},
|
| 309 |
+
],
|
| 310 |
+
)
|
| 311 |
+
return response.choices[0].message.content.strip()
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
class AzureTranslator(BaseTranslator):
|
| 315 |
+
def __init__(self, service, lang_out, lang_in, model):
|
| 316 |
+
lang_out = "zh-Hans" if lang_out == "auto" else lang_out
|
| 317 |
+
lang_in = "en" if lang_in == "auto" else lang_in
|
| 318 |
+
super().__init__(service, lang_out, lang_in, model)
|
| 319 |
+
|
| 320 |
+
try:
|
| 321 |
+
api_key = os.environ["AZURE_APIKEY"]
|
| 322 |
+
endpoint = os.environ["AZURE_ENDPOINT"]
|
| 323 |
+
region = os.environ["AZURE_REGION"]
|
| 324 |
+
except KeyError as e:
|
| 325 |
+
missing_var = e.args[0]
|
| 326 |
+
raise ValueError(
|
| 327 |
+
f"The environment variable '{missing_var}' is required but not set."
|
| 328 |
+
) from e
|
| 329 |
+
|
| 330 |
+
credential = AzureKeyCredential(api_key)
|
| 331 |
+
self.client = TextTranslationClient(
|
| 332 |
+
endpoint=endpoint, credential=credential, region=region
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
# https://github.com/Azure/azure-sdk-for-python/issues/9422
|
| 336 |
+
logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
|
| 337 |
+
logger.setLevel(logging.WARNING)
|
| 338 |
+
|
| 339 |
+
def translate(self, text) -> str:
|
| 340 |
+
response = self.client.translate(
|
| 341 |
+
body=[text],
|
| 342 |
+
from_language=self.lang_in,
|
| 343 |
+
to_language=[self.lang_out],
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
translated_text = response[0].translations[0].text
|
| 347 |
+
return translated_text
|