Spaces:
Paused
Paused
| import codecs | |
| from typing import Dict, List, Tuple, Union | |
| from .._codecs import _pdfdoc_encoding | |
| from .._utils import StreamType, b_, logger_warning, read_non_whitespace | |
| from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError | |
| from ._base import ByteStringObject, TextStringObject | |
def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal colour string into normalised RGB components.

    Leading ``#`` characters are stripped, then the first three pairs of
    hexadecimal digits are read as red, green and blue and each is divided
    by 255.0.

    :param str value: Colour such as ``"#ff8000"`` or ``"ff8000"``.
    :return: The ``(red, green, blue)`` components as floats.
    :raises ValueError: If a digit pair is missing or is not valid hexadecimal.
    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string (``<...>``) from ``stream``.

    The first byte of the stream is skipped, then tokens returned by
    ``read_non_whitespace`` are gathered until ``>`` is met. Each pair of
    collected digits becomes one character; a lone trailing digit is padded
    with ``0`` on the right. The result is handed to ``create_string_object``
    after conversion with ``b_``.

    :param StreamType stream: Stream positioned on the opening delimiter.
    :param forced_encoding: Passed unchanged to ``create_string_object``.
    :return: Whatever ``create_string_object`` builds from the decoded bytes.
    :raises PdfStreamError: If the stream ends before ``>`` is found.
    """
    stream.read(1)  # discard the opening delimiter
    decoded_chars = []
    pending = b""
    while True:
        # NOTE(review): presumably yields one byte at a time, and an empty
        # value at end of stream -- confirm against read_non_whitespace.
        digit = read_non_whitespace(stream)
        if not digit:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if digit == b">":
            break
        pending += digit
        if len(pending) == 2:
            decoded_chars.append(chr(int(pending, base=16)))
            pending = b""
    if len(pending) == 1:
        # Odd number of digits: the missing low nibble counts as zero.
        pending += b"0"
        decoded_chars.append(chr(int(pending, base=16)))
    return create_string_object(b_("".join(decoded_chars)), forced_encoding)
# Single-character escapes recognised after a backslash inside a literal
# string, mapped to the bytes they produce. Built once at import time instead
# of on every backslash encountered while parsing.
_LITERAL_STRING_ESCAPES = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
    b"c": rb"\c",
    b"(": b"(",
    b")": b")",
    b"/": b"/",
    b"\\": b"\\",
    b" ": b" ",
    b"%": b"%",
    b"<": b"<",
    b">": b">",
    b"[": b"[",
    b"]": b"]",
    b"#": b"#",
    b"_": b"_",
    b"&": b"&",
    b"$": b"$",
}


def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a literal string (``(...)``) from ``stream``.

    The first byte of the stream is skipped, then bytes are consumed until the
    parenthesis that balances it. Nested, unescaped parentheses are kept in the
    result. Backslash sequences are resolved as follows:

    * known single-character escapes are replaced by their value;
    * one to three octal digits produce a single byte, with any overflow
      above 8 bits discarded;
    * a backslash followed by an end-of-line marker (CR, LF or CR LF)
      produces nothing;
    * any other escaped byte is kept as-is and a warning is logged.

    :param StreamType stream: Seekable stream positioned on the opening
        parenthesis.
    :param forced_encoding: Passed unchanged to ``create_string_object``.
    :return: Whatever ``create_string_object`` builds from the collected bytes.
    :raises PdfStreamError: If the stream ends before the string is closed.
    """
    stream.read(1)  # discard the opening parenthesis
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            if not tok:
                # Nothing follows the backslash: the string can never be closed.
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            if tok in _LITERAL_STRING_ESCAPES:
                tok = _LITERAL_STRING_ESCAPES[tok]
            elif b"0" <= tok <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for _ in range(2):
                    ntok = stream.read(1)
                    if b"0" <= ntok <= b"7":
                        tok += ntok
                    else:
                        # ntok still has to be analysed by the main loop, but
                        # only rewind when a byte was actually read: at end of
                        # stream a rewind would replay the previous digit.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                # Keep the low 8 bits only, so that e.g. \777 yields one byte.
                tok = b_(chr(int(tok, base=8) & 0xFF))
            elif tok == b"\r":
                # Escaped line break. CR LF is the only two-byte end-of-line
                # marker, so swallow a directly following LF and nothing else.
                ntok = stream.read(1)
                if ntok and ntok != b"\n":
                    stream.seek(-1, 1)
                tok = b""
            elif tok == b"\n":
                # Escaped single-byte line break: contributes nothing. A
                # following CR or LF is a separate, unescaped line break.
                tok = b""
            else:
                # Unknown escape: the byte itself is kept. Decode leniently so
                # that a non-UTF-8 byte cannot make the warning itself fail.
                msg = rf"Unexpected escaped string: {tok.decode('utf8', errors='replace')}"
                logger_warning(msg, __name__)
        txt.append(tok)
    return create_string_object(b"".join(txt), forced_encoding)
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Build a TextStringObject or a ByteStringObject representing ``string``.

    A ``str`` input always gives a TextStringObject. For ``bytes`` input the
    result depends on ``forced_encoding``:

    * list or dict: every byte is looked up in it; bytes for which the lookup
      fails are decoded with the ``charmap`` codec instead;
    * the string ``"bytes"``: the data is returned as a ByteStringObject;
    * any other string: it is used as the codec name for ``bytes.decode``;
    * otherwise: data starting with the UTF-16 big-endian BOM is decoded as
      UTF-16, anything else through ``decode_pdfdocencoding``. If that
      decoding fails the data is returned as a ByteStringObject.

    :param Union[str, bytes] string: The value to wrap.
    :param forced_encoding: Optional encoding override, see above.
    :return: The text or byte string object.
    :raises TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        decoded = ""
        for code in string:
            try:
                decoded += forced_encoding[code]
            except Exception:
                # No usable entry for this byte: fall back to a 1:1 mapping.
                decoded += bytes((code,)).decode("charmap")
        return TextStringObject(decoded)

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        return TextStringObject(string.decode(forced_encoding))

    # No usable hint: try to interpret the data as text. The only way to know
    # whether that is possible is to attempt the decoding; some strings are
    # real text, others are just byte arrays.
    try:
        if string.startswith(codecs.BOM_UTF16_BE):
            result = TextStringObject(string.decode("utf-16"))
            result.autodetect_utf16 = True
        else:
            result = TextStringObject(decode_pdfdocencoding(string))
            result.autodetect_pdfdocencoding = True
        return result
    except UnicodeDecodeError:
        return ByteStringObject(string)
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode ``byte_array`` using the ``_pdfdoc_encoding`` translation table.

    Every byte is used as an index into ``_pdfdoc_encoding``. An entry equal
    to ``"\\u0000"`` marks a byte that has no translation.

    :param bytes byte_array: The raw bytes to decode.
    :return: The decoded text.
    :raises UnicodeDecodeError: If a byte has no entry in the translation
        table. The exception carries the input data and the position of the
        offending byte.
    """
    chars = []
    for index, b in enumerate(byte_array):
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            # Report the real input and the exact failing position;
            # bytearray(b) on an int would only build a run of b zero bytes.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(c)
    return "".join(chars)