Spaces:
Runtime error
Runtime error
| import binascii | |
| from binascii import Error as BinasciiError | |
| from binascii import unhexlify | |
| from math import ceil | |
| from typing import Any, Union, cast | |
| from ._codecs import adobe_glyphs, charset_encoding | |
| from ._utils import logger_error, logger_warning | |
| from .errors import LimitReachedError | |
| from .generic import ( | |
| DecodedStreamObject, | |
| DictionaryObject, | |
| NullObject, | |
| StreamObject, | |
| is_null_or_none, | |
| ) | |
# Maps predefined CMap names to the Python codec that decodes the
# corresponding byte strings. The "-H"/"-V" variants share one codec:
# the horizontal/vertical distinction only affects layout, not bytes.
_predefined_cmap: dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    "/UniGB-UTF16-H": "gb18030",
    "/UniGB-UTF16-V": "gb18030",
    # UCS2 in code
}
def get_encoding(
    ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
    """Build the character encoding and the ToUnicode map of a font.

    Returns a ``(encoding, map_dict)`` tuple where ``encoding`` is either a
    codec name or a code-point -> character dictionary, and ``map_dict`` is
    the translation table extracted from the /ToUnicode CMap.
    """
    enc = _parse_encoding(ft)
    unicode_map, cmap_codes = _parse_to_unicode(ft)
    # PDF ref 1.7 §5.9.1, first bullet: when the CMap provides an entry the
    # encoding must be discarded for that character (turned into identity
    # here). A string encoding is already expected to be an identity
    # translation, so only dict encodings need patching.
    if isinstance(enc, dict):
        for code in cmap_codes:
            if code <= 255:
                enc[code] = chr(code)
    return enc, unicode_map
def _parse_encoding(
    ft: DictionaryObject
) -> Union[str, dict[int, str]]:
    """Extract the /Encoding of a font.

    Returns either a codec name (str) or a mapping from 0-255 codes to
    characters. Unknown advanced encodings are logged and passed through.
    """
    encoding: Union[str, list[str], dict[int, str]] = []
    if "/Encoding" not in ft:
        # No /Encoding entry: fall back to the base font's known charset,
        # or to plain charmap decoding.
        if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
            encoding = dict(
                zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
            )
        else:
            encoding = "charmap"
        return encoding
    enc: Union[str, DictionaryObject, NullObject] = cast(
        Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
    )
    if isinstance(enc, str):
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                raise Exception("not found")
        except Exception:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            # Fix: report the unknown /BaseEncoding name; previously this
            # logged the local ``encoding`` accumulator, which is still the
            # empty list here, producing "Advanced encoding [] ...".
            logger_error(
                f"Advanced encoding {enc['/BaseEncoding']} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardEncoding"].copy()
    else:
        encoding = charset_encoding["/StandardEncoding"].copy()
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        # /Differences is an alternating sequence: an int sets the current
        # code, following names fill consecutive slots from that code.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    if x < len(encoding):
                        encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # Unknown glyph name: keep the raw name as fallback.
                    encoding[x] = o  # type: ignore
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding
def _parse_to_unicode(
    ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    """Parse the /ToUnicode CMap of a font.

    Returns ``(map_dict, int_entry)``: the translation table — with
    ``map_dict[-1]`` holding the number of bytes per code — and the list
    of CMap keys as ints, used later to correct the encoding.
    """
    map_dict: dict[Any, Any] = {}
    int_entry: list[int] = []
    if "/ToUnicode" not in ft:
        # Type1 fonts may carry an encoding inside their font program.
        if ft.get("/Subtype", "") == "/Type1":
            return _type1_alternative(ft, map_dict, int_entry)
        return {}, []
    in_range = False
    in_char = False
    # (current_char, remaining size) for a bfrange whose destination list
    # spills over several lines; cf #1285 for example of file
    pending_range: Union[None, tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range, in_char, pending_range = process_cm_line(
            raw_line.strip(b" \t"),
            in_range,
            in_char,
            pending_range,
            map_dict,
            int_entry,
        )
    return map_dict, int_entry
def prepare_cm(ft: DictionaryObject) -> bytes:
    """Normalize the raw /ToUnicode CMap into a line-oriented byte string."""
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # no usable stream: process the full range 0000-FFFF instead
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # Re-insert the line breaks that some producers (e.g. Word printing to
    # PDF) omit around the CMap operators.
    cm = cm.strip()
    for token, framed in (
        (b"beginbfchar", b"\nbeginbfchar\n"),
        (b"endbfchar", b"\nendbfchar\n"),
        (b"beginbfrange", b"\nbeginbfrange\n"),
        (b"endbfrange", b"\nendbfrange\n"),
        # text between << and >> not used but some solution to find it back
        (b"<<", b"\n{\n"),
        (b">>", b"\n}\n"),
    ):
        cm = cm.replace(token, framed)
    chunks = cm.split(b"<")
    for idx, chunk in enumerate(chunks):
        close = chunk.find(b">")
        if close < 0:
            continue
        if close == 0:
            # string is empty: stash a placeholder here (see below)
            # see https://github.com/py-pdf/pypdf/issues/1111
            payload = b"."
        else:
            payload = chunk[:close].replace(b" ", b"")
        chunks[idx] = payload + b" " + chunk[close + 1 :]
    return (
        (b" ".join(chunks))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, tuple[int, int]],
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
    """Dispatch one normalized CMap line.

    Toggles the bfrange/bfchar parser state on section markers, otherwise
    forwards the line to the active parser. Returns the updated
    ``(process_rg, process_char, multiline_rg)`` triple.
    """
    # skip blank lines and comments (%)
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    # section markers flip the corresponding state flag
    for token, is_range_flag, new_state in (
        (b"beginbfrange", True, True),
        (b"endbfrange", True, False),
        (b"beginbfchar", False, True),
        (b"endbfchar", False, False),
    ):
        if token in line:
            if is_range_flag:
                process_rg = new_state
            else:
                process_char = new_state
            return process_rg, process_char, multiline_rg
    if process_rg:
        try:
            multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
        except binascii.Error as error:
            # broken hex in a range entry: skip the line, keep parsing
            logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
| # Usual values should be up to 65_536. | |
| MAPPING_DICTIONARY_SIZE_LIMIT = 100_000 | |
| def _check_mapping_size(size: int) -> None: | |
| if size > MAPPING_DICTIONARY_SIZE_LIMIT: | |
| raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.") | |
def parse_bfrange(
    line: bytes,
    map_dict: dict[Any, Any],
    int_entry: list[int],
    multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
    """Parse one ``bfrange`` line of a /ToUnicode CMap.

    Fills ``map_dict`` (decoded source code -> unicode string) and
    ``int_entry`` (source codes as ints). ``multiline_rg`` carries the
    ``(current_char, end_char)`` state of a range whose destination list
    spilled over from a previous line; the return value is that state, or
    None once the range is closed.

    Raises LimitReachedError (via _check_mapping_size) when the mapping
    grows past the module limit; binascii.Error on broken hex propagates
    to the caller.
    """
    lst = [x for x in line.split(b" ") if x]
    closure_found = False
    entry_count = len(int_entry)
    _check_mapping_size(entry_count)
    if multiline_rg is not None:
        # Continuation line: only destination values (and possibly the
        # closing "]") appear here; the source range came from earlier.
        fmt = b"%%0%dX" % (map_dict[-1] * 2)  # hex width of a source code
        a = multiline_rg[0]  # a, b not in the current line
        b = multiline_rg[1]
        for sq in lst:
            if sq == b"]":
                closure_found = True
                break
            entry_count += 1
            _check_mapping_size(entry_count)
            # 1-byte codes decode as charmap, multi-byte as UTF-16-BE
            map_dict[
                unhexlify(fmt % a).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
            ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
            int_entry.append(a)
            a += 1
    else:
        # New range: "<a> <b> <dst>" or "<a> <b> [ <dst> <dst> ... ]"
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        nbi = max(len(lst[0]), len(lst[1]))
        map_dict[-1] = ceil(nbi / 2)  # bytes per source code
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if lst[2] == b"[":
            # explicit list: one destination per consecutive source code
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                entry_count += 1
                _check_mapping_size(entry_count)
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
        else:  # case without list
            # destinations are consecutive values starting at lst[2]
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            range_size = max(0, b - a + 1)
            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
            while a <= b:
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    return None if closure_found else (a, b)
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
    """Parse one ``bfchar`` line: alternating <src> <dst> hex tokens.

    Fills ``map_dict`` (decoded source code -> unicode string, "" on the
    "." placeholder or broken hex) and ``int_entry`` (source codes as
    ints). ``map_dict[-1]`` records the number of bytes per source code.
    """
    tokens = [t for t in line.split(b" ") if t]
    # This can be checked beforehand.
    _check_mapping_size(len(int_entry) + len(tokens) // 2)
    map_dict[-1] = len(tokens[0]) // 2
    # consume tokens pairwise; a trailing unpaired token is ignored
    for src, dst in zip(tokens[0::2], tokens[1::2]):
        translated = ""
        # placeholder (see prepare_cm) means empty string
        if dst != b".":
            try:
                translated = unhexlify(dst).decode(
                    "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
                )
            except BinasciiError as exception:
                logger_warning(f"Got invalid hex string: {exception!s} ({dst!r})", __name__)
        key = unhexlify(src).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
        )
        map_dict[key] = translated
        int_entry.append(int(src, 16))
def _type1_alternative(
    ft: DictionaryObject,
    map_dict: dict[Any, Any],
    int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
    """Build a ToUnicode-like map from an embedded Type1 font program.

    Scans the clear-text part of /FontDescriptor -> /FontFile for
    ``dup <code> /<glyphname> put`` entries of its /Encoding array, filling
    ``map_dict`` (chr(code) -> unicode string) and ``int_entry`` (codes).

    Returns the (possibly unchanged) ``(map_dict, int_entry)`` pair.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if is_null_or_none(ft_desc):
        return map_dict, int_entry
    assert ft_desc is not None, "mypy"
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    # Robustness fix: a malformed font program may have no /Encoding section
    # in its clear part; indexing [1] unconditionally raised IndexError.
    parts = txt.split(b"/Encoding")
    if len(parts) < 2:
        return map_dict, int_entry
    txt = parts[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if li.startswith(b"dup"):
            words = [_w for _w in li.split(b" ") if _w != b""]
            # Robustness fix: a truncated "dup" line used to raise IndexError
            # on words[1]/words[2] below.
            if len(words) < 3:
                continue
            if len(words) > 3 and words[3] != b"put":
                continue
            try:
                i = int(words[1])
            except ValueError:  # pragma: no cover
                continue
            try:
                v = adobe_glyphs[words[2].decode()]
            except KeyError:
                # /uniXXXX glyph names carry the code point directly
                if words[2].startswith(b"/uni"):
                    try:
                        v = chr(int(words[2][4:], 16))
                    except ValueError:  # pragma: no cover
                        continue
                else:
                    continue
            map_dict[chr(i)] = v
            int_entry.append(i)
    return map_dict, int_entry