# Taha Mahmood
# Initial upload
# 754d92a
import enum
import functools
import logging
import re
from pathlib import Path
import pymupdf
from babeldoc.assets import assets
from babeldoc.format.pdf.document_il import PdfFont
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.translation_config import TranslationConfig
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class PrimaryFontFamily(enum.IntEnum):
    """Font family selector parsed from a configuration string.

    ``NONE`` is the value ``from_str`` yields for anything it does not
    recognise (including ``None``).
    """

    SERIF = 1
    SANS_SERIF = 2
    SCRIPT = 3
    NONE = 4

    @classmethod
    def from_str(cls, value: str):
        """Return the member named by *value*, or ``NONE`` if it is unknown.

        Recognised spellings are exactly ``"serif"``, ``"sans-serif"`` and
        ``"script"``; the comparison is case-sensitive.
        """
        spellings = (
            ("serif", cls.SERIF),
            ("sans-serif", cls.SANS_SERIF),
            ("script", cls.SCRIPT),
        )
        return next(
            (member for label, member in spellings if value == label),
            cls.NONE,
        )
class FontMapper:
    """Pick replacement fonts for characters and register them in a PDF.

    On construction, every font file listed for the output language (the
    ``normal``, ``script``, ``fallback`` and ``base`` groups) is loaded as a
    ``pymupdf.Font``. ``map`` then chooses one of those fonts for a single
    character, and ``add_font`` inserts the fonts used by an IL document into
    the PDF and records them on the IL pages.
    """

    stage_name = "Add Fonts"

    # Matches an indirect reference such as "12 0 R"; group 1 is the object number.
    _FONT_REF_PATTERN = re.compile(r"(\d+) 0 R")

    def __init__(self, translation_config: TranslationConfig):
        """Load the font family for ``translation_config.lang_out``.

        Args:
            translation_config: Supplies ``primary_font_family`` (``None``,
                ``"serif"``, ``"sans-serif"`` or ``"script"``), ``lang_out``
                and, later, the progress monitor used by ``add_font``.
        """
        self.translation_config = translation_config
        assert translation_config.primary_font_family in [
            None,
            "serif",
            "sans-serif",
            "script",
        ]
        self.primary_font_family = PrimaryFontFamily.from_str(
            translation_config.primary_font_family,
        )

        font_family = assets.get_font_family(translation_config.lang_out)
        self.font_file_names = []
        for group in ("normal", "script", "fallback", "base"):
            self.font_file_names.extend(font_family[group])

        self.fonts: dict[str, pymupdf.Font] = {}
        self.fontid2fontpath: dict[str, Path] = {}
        for font_file_name in self.font_file_names:
            # The same file may be listed in several groups; load it once.
            if font_file_name in self.fontid2fontpath:
                continue
            font_path, font_metadata = assets.get_font_and_metadata(font_file_name)
            pymupdf_font = pymupdf.Font(fontfile=str(font_path))
            # Memoise the per-font lookups that are hit once per character.
            pymupdf_font.has_glyph = functools.lru_cache(maxsize=10240, typed=True)(
                pymupdf_font.has_glyph,
            )
            pymupdf_font.char_lengths = functools.lru_cache(maxsize=10240, typed=True)(
                pymupdf_font.char_lengths,
            )
            # Attach the identifiers and metrics the rest of this class reads.
            pymupdf_font.font_id = font_file_name
            pymupdf_font.font_path = font_path
            pymupdf_font.ascent_fontmap = font_metadata["ascent"]
            pymupdf_font.descent_fontmap = font_metadata["descent"]
            pymupdf_font.encoding_length = font_metadata["encoding_length"]
            self.fonts[font_file_name] = pymupdf_font
            self.fontid2fontpath[font_file_name] = font_path

        self.normal_font_ids: list[str] = font_family["normal"]
        self.script_font_ids: list[str] = font_family["script"]
        self.fallback_font_ids: list[str] = font_family["fallback"]
        self.base_font_ids: list[str] = font_family["base"]

        # "base" is an alias for the first font of the base group.
        self.fontid2fontpath["base"] = self.fontid2fontpath[font_family["base"][0]]

        self.fontid2font: dict[str, pymupdf.Font] = {
            f.font_id: f for f in self.fonts.values()
        }
        self.fontid2font["base"] = self.fontid2font[self.base_font_ids[0]]

        self.normal_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.normal_font_ids
        ]
        self.script_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.script_font_ids
        ]
        self.fallback_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.fallback_font_ids
        ]
        self.base_font = self.fontid2font["base"]

        self.type2font: dict[str, list[pymupdf.Font]] = {
            "normal": self.normal_fonts,
            "script": self.script_fonts,
            "fallback": self.fallback_fonts,
            "base": [self.base_font],
        }

        # Per-instance memoisation of the two lookups below.
        self.has_char = functools.lru_cache(maxsize=10240, typed=True)(self.has_char)
        self.map_in_type = functools.lru_cache(maxsize=10240, typed=True)(
            self.map_in_type
        )

    def has_char(self, char_unicode: str):
        """Return True if any loaded font has a glyph for *char_unicode*.

        Strings that are not exactly one code point long yield False.
        """
        if len(char_unicode) != 1:
            return False
        current_char = ord(char_unicode)
        return any(font.has_glyph(current_char) for font in self.fonts.values())

    def map_in_type(
        self,
        bold: bool,
        italic: bool,
        monospaced: bool,
        serif: bool,
        char_unicode: str,
        font_type: str,
    ):
        """Find a font of group *font_type* matching the requested style.

        A font matches when it has a glyph for the character, its boldness
        equals *bold*, and the presence of "serif" in its font id equals
        *serif*. The ``"script"`` group is only searched when *italic* is
        truthy. *monospaced* is accepted but not used for matching.

        Returns:
            The first matching font, or ``None`` when nothing matches or
            *char_unicode* is not exactly one code point long.
        """
        if font_type == "script" and not italic:
            return None
        # Same guard as has_char: ord() only accepts a single code point.
        if len(char_unicode) != 1:
            return None
        current_char = ord(char_unicode)
        want_bold = bool(bold)
        want_serif = bool(serif)
        for font in self.type2font[font_type]:
            if not font.has_glyph(current_char):
                continue
            if want_bold != bool(font.is_bold):
                continue
            # For unknown reasons Source Han Sans reports its serif attribute
            # as 1, so as a workaround the font id is inspected instead.
            if want_serif != ("serif" in font.font_id.lower()):
                continue
            return font
        return None

    def map(self, original_font: PdfFont, char_unicode: str):
        """Choose a loaded font to render *char_unicode*.

        The style (bold/italic/monospace/serif) is read from *original_font*
        and then overridden by the configured primary font family. Candidates
        are tried in this order: style-matched script font, any script font
        with the glyph (italic only), style-matched normal font, style-matched
        fallback font, any fallback font with the glyph.

        Args:
            original_font: A ``pymupdf.Font`` or an IL ``PdfFont``.
            char_unicode: The character to render; must be one code point.

        Returns:
            The chosen font, or ``None`` if *original_font* is of an unknown
            type, *char_unicode* is not a single code point, or no loaded
            font has the glyph.
        """
        # Consistent with has_char: anything but one code point has no font.
        if len(char_unicode) != 1:
            return None
        current_char = ord(char_unicode)

        if isinstance(original_font, pymupdf.Font):
            bold = original_font.is_bold
            italic = original_font.is_italic
            monospaced = original_font.is_monospaced
            serif = original_font.is_serif
        elif isinstance(original_font, PdfFont):
            bold = original_font.bold
            italic = original_font.italic
            monospaced = original_font.monospace
            serif = original_font.serif
        else:
            logger.error(
                f"Unknown font type: {type(original_font)}. "
                f"Original font: {original_font}. "
                f"Char unicode: {char_unicode}. ",
            )
            return None

        # A configured primary family overrides the original font's style.
        if self.primary_font_family == PrimaryFontFamily.SERIF:
            serif = True
        elif self.primary_font_family == PrimaryFontFamily.SANS_SERIF:
            serif = False
        elif self.primary_font_family == PrimaryFontFamily.SCRIPT:
            serif = False
            italic = True

        script_font_map_result = self.map_in_type(
            bold, italic, monospaced, serif, char_unicode, "script"
        )
        if script_font_map_result:
            return script_font_map_result

        # Italic text may still use a script font whose style did not match.
        for script_font in self.script_fonts:
            if italic and script_font.has_glyph(current_char):
                return script_font

        normal_font_map_result = self.map_in_type(
            bold, italic, monospaced, serif, char_unicode, "normal"
        )
        if normal_font_map_result is not None:
            return normal_font_map_result

        fallback_font_map_result = self.map_in_type(
            bold, italic, monospaced, serif, char_unicode, "fallback"
        )
        if fallback_font_map_result is not None:
            return fallback_font_map_result

        # Last resort: ignore style and take any fallback font with the glyph.
        for font in self.fallback_fonts:
            if font.has_glyph(current_char):
                return font

        # A pymupdf.Font supplied by the caller may lack the font_id attribute.
        original_font_id = getattr(original_font, "font_id", None)
        logger.warning(
            f"Can't find font for {char_unicode}({current_char}). "
            f"Original font: {original_font.name}[{original_font_id}]. "
            f"Char unicode: {char_unicode}. ",
        )
        return None

    def get_used_font_ids(self, il: il_version_1.Document) -> set[str]:
        """Collect the font ids referenced by characters in *il*.

        Both page-level characters and characters inside paragraph
        compositions are inspected; entries without a style or font id are
        skipped.
        """
        result = set()
        for page in il.page:
            for char in page.pdf_character:
                if char.pdf_style and char.pdf_style.font_id:
                    result.add(char.pdf_style.font_id)
            for para in page.pdf_paragraph:
                for comp in para.pdf_paragraph_composition:
                    if char := comp.pdf_character:
                        if char.pdf_style and char.pdf_style.font_id:
                            result.add(char.pdf_style.font_id)
        return result

    def _link_fonts_in_xref(self, doc_zh, xref, font_list, font_id):
        """Add missing font entries to the font dictionary of object *xref*.

        The font dictionary is looked up under ``Resources/Font`` and under
        ``Font`` (the latter covers resources reached through an XObject).
        Fonts already present are left untouched. Failures are logged at
        debug level and otherwise ignored, because xref reads and writes may
        fail on some objects.

        Args:
            doc_zh: Document providing ``xref_get_key`` / ``xref_set_key``.
            xref: Number of the object to inspect.
            font_list: ``(font name, font path)`` pairs to register.
            font_id: Maps font name to the xref of the inserted font.
        """
        for label in ["Resources/", ""]:
            try:
                font_res = doc_zh.xref_get_key(xref, f"{label}Font")
                if font_res is None:
                    continue
                # Keep the inspected object's number intact so that the next
                # label is still probed on the same object.
                target_xref = xref
                target_key_prefix = f"{label}Font/"
                kind = font_res[0]
                if kind == "xref":
                    # The font dictionary is a separate object: write there.
                    ref_match = self._FONT_REF_PATTERN.search(font_res[1])
                    if ref_match is None:
                        logger.debug(
                            "Unparsable font reference %r on xref %s",
                            font_res[1],
                            xref,
                        )
                        continue
                    target_xref = int(ref_match.group(1))
                    target_key_prefix = ""
                    kind = "dict"
                if kind != "dict":
                    continue
                for font_name, _ in font_list:
                    target_key = f"{target_key_prefix}{font_name}"
                    font_exist = doc_zh.xref_get_key(target_xref, target_key)
                    if font_exist[0] == "null":
                        doc_zh.xref_set_key(
                            target_xref,
                            target_key,
                            f"{font_id[font_name]} 0 R",
                        )
            except Exception:
                # Best effort by design; keep going with the next label.
                logger.debug(
                    "Failed to register fonts on xref %s (label %r)",
                    xref,
                    label,
                    exc_info=True,
                )

    def add_font(self, doc_zh: pymupdf.Document, il: il_version_1.Document):
        """Insert the fonts used by *il* into *doc_zh* and record them in *il*.

        Each used font is inserted through the first page of *doc_zh*, then
        referenced from every font dictionary found in the document's
        objects. Finally one ``PdfFont`` per inserted font is appended to
        every IL page and to every XObject on those pages. Progress is
        reported through the translation config's progress monitor.

        Nothing is inserted when *il* has no pages.
        """
        used_font_ids = self.get_used_font_ids(il)
        font_list = [
            (k, v) for k, v in self.fontid2fontpath.items() if k in used_font_ids
        ]
        font_id = {}
        xreflen = doc_zh.xref_length()
        # One step per inserted font, per xref, per PdfFont built and per page.
        total = xreflen - 1 + len(font_list) + len(il.page) + len(font_list)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as pbar:
            if not il.page:
                pbar.advance(total)
                return

            for font_name, font_path in font_list:
                font_id[font_name] = doc_zh[0].insert_font(font_name, font_path)
                pbar.advance(1)

            for xref in range(1, xreflen):
                pbar.advance(1)
                self._link_fonts_in_xref(doc_zh, xref, font_list, font_id)

            # Create all PdfFont objects up front.
            pdf_fonts = []
            for font_name, _ in font_list:
                assert font_name in self.fontid2font, f"Font {font_name} not found"
                mupdf_font = self.fontid2font[font_name]
                pdf_fonts.append(
                    il_version_1.PdfFont(
                        name=font_name,
                        xref_id=font_id[font_name],
                        font_id=font_name,
                        encoding_length=mupdf_font.encoding_length,
                        bold=mupdf_font.is_bold,
                        italic=mupdf_font.is_italic,
                        monospace=mupdf_font.is_monospaced,
                        serif=mupdf_font.is_serif,
                        descent=mupdf_font.descent_fontmap,
                        ascent=mupdf_font.ascent_fontmap,
                    ),
                )
                pbar.advance(1)

            # Add the fonts to every page and XObject in one pass.
            for page in il.page:
                page.pdf_font.extend(pdf_fonts)
                for xobj in page.pdf_xobject:
                    xobj.pdf_font.extend(pdf_fonts)
                pbar.advance(1)