Spaces:
Runtime error
Runtime error
| from collections.abc import Sequence | |
| from dataclasses import dataclass, field | |
| from typing import Any, Union, cast | |
| from pypdf.generic import ArrayObject, DictionaryObject, NameObject | |
| from ._cmap import get_encoding | |
| from ._codecs.adobe_glyphs import adobe_glyphs | |
| from ._utils import logger_warning | |
| from .constants import FontFlags | |
@dataclass
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.

    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Attributes:
        name: Font name; "Unknown" when not supplied.
        family: Font family; "Unknown" when not supplied.
        weight: Font weight; "Unknown" when not supplied.
        ascent: Ascent metric.
        descent: Descent metric (negative by default).
        cap_height: Cap height metric.
        x_height: x-height metric.
        italic_angle: Italic angle; 0.0 means non-italic.
        flags: Font flags bit field.
        bbox: Font bounding box as a 4-tuple of floats.

    """

    # Descriptive information
    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    # Metric information
    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
@dataclass
class CoreFontMetrics:
    """
    Pairs a font descriptor with a mapping of characters to widths.

    Attributes:
        font_descriptor: Descriptive and metric information of the font.
        character_widths: A mapping of characters to widths.

    """

    font_descriptor: FontDescriptor
    character_widths: dict[str, int]
@dataclass
class Font:
    """
    A font object for use during text extraction and for producing
    text appearance streams.

    Attributes:
        name: Font name, derived from font["/BaseFont"]
        character_map: The font's character map
        encoding: Font encoding
        sub_type: The font type, such as Type1, TrueType, or Type3.
        font_descriptor: Descriptive and metric information about the font
        character_widths: A mapping of characters to widths
        space_width: The width of a space, or an approximation
        interpretable: Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    name: str
    encoding: Union[str, dict[int, str]]

    character_map: dict[Any, Any] = field(default_factory=dict)
    sub_type: str = "Unknown"
    # A fresh default descriptor is created for each Font instance.
    font_descriptor: FontDescriptor = field(default_factory=lambda: FontDescriptor())
    character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
    space_width: Union[float, int] = 250
    interpretable: bool = True

    @staticmethod
    def _collect_tt_t1_character_widths(
        pdf_font_dict: DictionaryObject,
        char_map: dict[Any, Any],
        encoding: Union[str, dict[int, str]],
        current_widths: dict[str, int]
    ) -> None:
        """
        Parse a TrueType or Type1 font's /Widths array and update character widths.

        Args:
            pdf_font_dict: The font dictionary; must contain "/Widths".
            char_map: Mapping from raw decoded characters to Unicode characters.
            encoding: Either a codec name or a mapping of character codes to characters.
            current_widths: Mapping of characters to widths; updated in place.

        """
        widths_array = cast(ArrayObject, pdf_font_dict["/Widths"])
        first_char = pdf_font_dict.get("/FirstChar", 0)

        if not isinstance(encoding, str):
            # This means that encoding is a dict: look up each code directly and
            # fall back to the code point itself when the code is not listed.
            current_widths.update({
                encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(widths_array)
            })
            return

        # We map the character code directly to the character
        # using the string encoding
        for idx, width in enumerate(widths_array):
            # Often "idx == 0" will denote the .notdef character, but we add it anyway
            char_code = idx + first_char  # This is a raw code
            # Get the "raw" character or byte representation
            raw_char = bytes([char_code]).decode(encoding, "surrogatepass")
            # Translate raw_char to the REAL Unicode character using the char_map
            unicode_char = char_map.get(raw_char)
            if unicode_char:
                current_widths[unicode_char] = int(width)
            else:
                current_widths[raw_char] = int(width)

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Args:
            d_font: The descendant font dictionary; "/W" is optional.
            char_map: Character map; only string keys are considered.
            current_widths: Mapping of characters to widths; updated in place.

        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        entry_count = len(_w)
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This entry was already consumed as part of a previous definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):
                # We should never get here due to skip_count above. But
                # sometimes we do.
                logger_warning(f"Expected numeric value for width, got {w_entry}. Ignoring it.", __name__)
                continue

            # Only look ahead as far as the array actually extends, so that a
            # truncated definition ends in the warning below instead of an IndexError.
            w_next_entry = _w[idx + 1].get_object() if idx + 1 < entry_count else None

            # check for format (1): `int [int int int int ...]`
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
                continue

            # check for format (2): `int int int`
            if isinstance(w_next_entry, (int, float)) and idx + 2 < entry_count:
                const_width = _w[idx + 2].get_object()
                if isinstance(const_width, (int, float)):
                    start_idx, stop_idx = w_entry, w_next_entry
                    current_widths.update(
                        {
                            ord_map[_cidx]: const_width
                            for _cidx in range(
                                cast(int, start_idx), cast(int, stop_idx + 1), 1
                            )
                            if _cidx in ord_map
                        }
                    )
                    skip_count = 2
                    continue

            # Neither format matched. This includes reaching the end of the width
            # definitions while expecting more elements.
            logger_warning(
                f"Invalid font width definition. Last element: {w_entry}.",
                __name__
            )

    @staticmethod
    def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
        """
        Set the "default" entry of a character width mapping.

        The value is 500 for an empty mapping. If a non-zero space width is known,
        it is the space width for fixed-pitch fonts and twice the space width
        otherwise. Failing that, it is the (floored) average of all positive
        widths, or 500 if there are none.

        Args:
            current_widths: Mapping of characters to widths; updated in place.
            flags: Font flags, checked for the fixed-pitch bit.

        """
        if not current_widths:
            current_widths["default"] = 500
            return

        if " " in current_widths and current_widths[" "] != 0:
            # Setting default to once or twice the space width, depending on fixed pitch
            if (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                current_widths["default"] = current_widths[" "]
                return

            current_widths["default"] = int(2 * current_widths[" "])
            return

        # Use the average width of existing glyph widths
        valid_widths = [w for w in current_widths.values() if w > 0]
        current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

    @staticmethod
    def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Translate a PDF font descriptor dictionary into FontDescriptor keyword arguments.

        Only keys present in the dictionary are returned. A "/FontBBox" entry is
        converted to a tuple of floats, which must have exactly four elements.

        Args:
            font_descriptor_obj: The "/FontDescriptor" dictionary of a font.

        Returns:
            A mapping of FontDescriptor field names to the values found.

        """
        font_descriptor_kwargs: dict[Any, Any] = {}
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_obj:
                font_descriptor_kwargs[target_key] = font_descriptor_obj[source_key]
        # A missing bbox is fine (the FontDescriptor default applies); only
        # normalise it when the PDF actually provides one.
        if "bbox" in font_descriptor_kwargs:
            bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_descriptor_kwargs["bbox"] = bbox_tuple
        return font_descriptor_kwargs

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
    ) -> "Font":
        """
        Build a Font from a PDF font resource dictionary.

        Name, sub type, encoding and character map are read from the dictionary
        itself. Character widths and the font descriptor are collected according
        to the font type; a default width and a space width are derived when the
        dictionary does not provide them.

        Args:
            pdf_font_dict: The font resource dictionary.

        Returns:
            The populated Font.

        """
        from pypdf._codecs.core_font_metrics import CORE_FONT_METRICS  # noqa: PLC0415

        # Can collect base_font, name and encoding directly from font resource
        name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        sub_type = pdf_font_dict.get("/Subtype", "Unknown").removeprefix("/")
        encoding, character_map = get_encoding(pdf_font_dict)
        font_descriptor = None
        character_widths: dict[str, int] = {}
        interpretable = True

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
            # reliably converted into character codes unless all named chars
            # in /CharProcs map to a standard adobe glyph. See §9.10.2 of the
            # PDF 1.7 standard.
            if sub_type == "Type3" and "/ToUnicode" not in pdf_font_dict:
                interpretable = all(
                    cname in adobe_glyphs
                    for cname in pdf_font_dict.get("/CharProcs") or []
                )

            if interpretable:  # Save some overhead if font is not interpretable
                if "/Widths" in pdf_font_dict:
                    cls._collect_tt_t1_character_widths(
                        pdf_font_dict, character_map, encoding, character_widths
                    )
                elif name in CORE_FONT_METRICS:
                    font_descriptor = CORE_FONT_METRICS[name].font_descriptor
                    # Copy, so that the default-width updates below do not leak
                    # into the shared CORE_FONT_METRICS table.
                    character_widths = dict(CORE_FONT_METRICS[name].character_widths)
                if "/FontDescriptor" in pdf_font_dict:
                    font_descriptor_obj = pdf_font_dict.get("/FontDescriptor", DictionaryObject()).get_object()
                    if "/MissingWidth" in font_descriptor_obj:
                        character_widths["default"] = cast(int, font_descriptor_obj["/MissingWidth"].get_object())
                    font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))
                elif "/FontBBox" in pdf_font_dict:
                    # For Type3 without Font Descriptor but with FontBBox, see Table 110 in the PDF specification 2.0
                    bbox_tuple = tuple(map(float, cast(ArrayObject, pdf_font_dict["/FontBBox"])))
                    assert len(bbox_tuple) == 4, bbox_tuple
                    font_descriptor = FontDescriptor(name=name, bbox=bbox_tuple)
        else:
            # Composite font or CID font - CID fonts have a /W array mapping character codes
            # to widths stashed in /DescendantFonts. No need to test for /DescendantFonts though,
            # because all other fonts have already been dealt with.
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved descendant font back into the array.
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                cls._collect_cid_character_widths(
                    d_font, character_map, character_widths
                )
                if "/DW" in d_font:
                    character_widths["default"] = cast(int, d_font["/DW"].get_object())
                font_descriptor_obj = d_font.get("/FontDescriptor", DictionaryObject()).get_object()
                font_descriptor = FontDescriptor(**cls._parse_font_descriptor(font_descriptor_obj))

        if not font_descriptor:
            font_descriptor = FontDescriptor(name=name)

        # A missing or zero default width is replaced by a derived one.
        if character_widths.get("default", 0) == 0:
            cls._add_default_width(character_widths, font_descriptor.flags)

        # Approximate the space width from the default width when it is unknown.
        space_width = character_widths.get(" ", 0)
        if space_width == 0:
            if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
                space_width = character_widths["default"]
            else:
                space_width = character_widths["default"] // 2

        return cls(
            name=name,
            sub_type=sub_type,
            encoding=encoding,
            font_descriptor=font_descriptor,
            character_map=character_map,
            character_widths=character_widths,
            space_width=space_width,
            interpretable=interpretable
        )

    def as_font_resource(self) -> DictionaryObject:
        """Return a Type1 font resource dictionary with WinAnsiEncoding for this font's name."""
        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
        return (
            DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject(f"/{self.name}"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
        )

    def text_width(self, text: str = "") -> float:
        """Sum of character widths specified in PDF font for the supplied text."""
        widths = self.character_widths
        # Characters without an explicit width use the "default" entry.
        return sum(
            (widths.get(char, widths["default"]) for char in text), 0.0
        )