Spaces:
Runtime error
Runtime error
File size: 7,258 Bytes
ef60d00 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | import codecs
from typing import Union
from .._codecs import _pdfdoc_encoding
from .._utils import StreamType, logger_warning, read_non_whitespace
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
from ._base import ByteStringObject, TextStringObject
def hex_to_rgb(value: str) -> tuple[float, float, float]:
    """
    Convert a hexadecimal colour such as ``"#336699"`` to an RGB triple.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#`` characters.

    Returns:
        The red, green and blue components, each scaled to the range 0.0-1.0.

    Raises:
        ValueError: If any two-character channel slice is not valid hexadecimal
            (including when fewer than six digits are supplied).
    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[start : start + 2], 16) / 255.0 for start in (0, 2, 4)
    )
    return red, green, blue
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string (``<...>``) from the stream.

    The first byte of the stream is skipped, then non-whitespace characters
    are collected in pairs until ``>`` is reached. A trailing unpaired digit
    is padded with ``0``.

    Args:
        stream: The stream to read from, positioned on the opening delimiter.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.
    """
    stream.read(1)  # skip the opening delimiter
    byte_values = []
    pair = b""
    while True:
        char = read_non_whitespace(stream)
        if not char:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if char == b">":
            break
        pair += char
        if len(pair) == 2:
            byte_values.append(int(pair, base=16))
            pair = b""

    # An odd number of digits: the missing final digit is taken to be zero.
    if len(pair) == 1:
        pair += b"0"
    if pair != b"":
        byte_values.append(int(pair, base=16))

    return create_string_object(bytes(byte_values), forced_encoding)
# Maps the byte that follows a backslash in a literal string to the byte value
# it stands for.
__ESCAPE_DICT__ = {
    # Escapes that denote a control character.
    b"n": 0x0A,  # line feed
    b"r": 0x0D,  # carriage return
    b"t": 0x09,  # horizontal tab
    b"b": 0x08,  # backspace
    b"f": 0x0C,  # form feed
    # Escapes that simply stand for the escaped character itself.
    **{bytes((code,)): code for code in b"()/\\ %<>[]#_&$"},
}
# Byte value of the backslash character.
__BACKSLASH_CODE__ = ord(b"\\")
def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    r"""
    Read a literal string (``(...)``) from the stream.

    The first byte of the stream is skipped, then bytes are collected until
    the parenthesis that balances the opening one. Nested parentheses are kept
    as part of the string. Backslash sequences are handled as follows:

    * a byte listed in ``__ESCAPE_DICT__`` is replaced by its mapped value;
    * one to three octal digits are replaced by the byte they encode; if the
      value exceeds 255 the backslash is kept literally and the digits are
      re-read as ordinary characters;
    * a backslash followed by an end-of-line marker (CR, LF or CR LF) is a
      line continuation and contributes nothing to the string;
    * any other byte triggers a warning and both the backslash and the byte
      are kept.

    Args:
        stream: The stream to read from, positioned on the opening delimiter.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the closing parenthesis.
    """
    stream.read(1)  # skip the opening delimiter
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = stream.read(1)
            if not tok:
                # Backslash was the last byte: the string can never be closed.
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)

            escaped = __ESCAPE_DICT__.get(tok)
            if escaped is not None:
                txt.append(escaped)
                continue

            if b"0" <= tok <= b"7":
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                sav = stream.tell() - 1  # position of the first octal digit
                for _ in range(2):
                    ntok = stream.read(1)
                    if b"0" <= ntok <= b"7":
                        tok += ntok
                    else:
                        # ntok has to be analyzed by the main loop. At end of
                        # stream nothing was read, so there is nothing to
                        # push back.
                        if ntok:
                            stream.seek(-1, 1)
                        break
                i = int(tok, base=8)
                if i > 255:
                    # Not a valid byte: keep the backslash and re-read the
                    # digits as plain characters.
                    txt.append(__BACKSLASH_CODE__)
                    stream.seek(sav)
                else:
                    txt.append(i)
                continue

            if tok in (b"\n", b"\r"):
                # Escaped line break: add nothing to the string. Only CR LF is
                # a two-byte end-of-line marker, so a second byte is consumed
                # only when an LF directly follows a CR; any other following
                # line break belongs to the string itself.
                if tok == b"\r":
                    ntok = stream.read(1)
                    if ntok and ntok != b"\n":
                        stream.seek(-1, 1)
                continue

            msg = f"Unexpected escaped string: {tok.decode('utf-8', 'ignore')}"
            logger_warning(msg, __name__)
            txt.append(__BACKSLASH_CODE__)
        txt.append(ord(tok))
    return create_string_object(bytes(txt), forced_encoding)
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, list[str], dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap a Python string or bytes value in the matching PDF string object.

    Args:
        string: The data being used.
        forced_encoding: Typically None. A list or dict is used as a per-byte
            lookup table, the string ``"bytes"`` keeps the data undecoded, and
            any other string is used as the codec name for decoding.

    Returns:
        A TextStringObject when the data is text or could be decoded, otherwise
        a ByteStringObject.

    Raises:
        TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        # Translate byte by byte; anything the table cannot provide falls back
        # to the "charmap" codec.
        text = ""
        for code in string:
            try:
                text = text + forced_encoding[code]
            except Exception:
                text = text + bytes((code,)).decode("charmap")
        mapped = TextStringObject(text)
        mapped._original_bytes = string
        return mapped

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        decoded = TextStringObject(string.decode(forced_encoding))
        decoded._original_bytes = string
        return decoded

    # No encoding was forced: try UTF-16 first (explicit BOM, then a NUL byte
    # in the first or second position), and PDFDocEncoding after that.
    try:
        if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
            codec, bom = "utf-16", string[:2]
        elif string.startswith(b"\x00"):
            codec, bom = "utf-16be", codecs.BOM_UTF16_BE
        elif string[1:2] == b"\x00":
            codec, bom = "utf-16le", codecs.BOM_UTF16_LE
        else:
            codec, bom = None, None

        if codec is not None:
            result = TextStringObject(string.decode(codec))
            result._original_bytes = string
            result.autodetect_utf16 = True
            result.utf16_bom = bom
            return result

        # Some strings are text and some are plain byte arrays; the only way
        # to find out which is to attempt the conversion.
        result = TextStringObject(decode_pdfdocencoding(string))
        result._original_bytes = string
        result.autodetect_pdfdocencoding = True
        return result
    except UnicodeDecodeError:
        return ByteStringObject(string)
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode bytes through the ``_pdfdoc_encoding`` translation table.

    Each byte value is used as an index into ``_pdfdoc_encoding``. A table
    entry equal to the NUL character (U+0000) is treated as "no mapping".

    Args:
        byte_array: The bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If a byte has no mapping in the table. The
            exception carries the input data and the position of the
            offending byte.
    """
    chars = []
    for index, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # Report the actual data and offset so the error message names
            # the byte that failed.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)
|