Spaces:

gilzero
/

editor-app-v10

Paused

App Files Files Community

editor-app-v10 / myenv /lib /python3.10 /site-packages /PyPDF2 /_cmap.py

gilzero

Upload folder using huggingface_hub

cb1a5c9 verified over 1 year ago

raw

history blame contribute delete

14.6 kB

	import warnings
	from binascii import unhexlify
	from math import ceil
	from typing import Any, Dict, List, Tuple, Union, cast

	from ._codecs import adobe_glyphs, charset_encoding
	from ._utils import logger_warning
	from .errors import PdfReadWarning
	from .generic import DecodedStreamObject, DictionaryObject, StreamObject


	# code freely inspired from @twiggy ; see #711
	def build_char_map(
	    font_name: str, space_width: float, obj: DictionaryObject
	) -> Tuple[
	    str, float, Union[str, Dict[int, str]], Dict, DictionaryObject
	]:  # font_type, space_width / 2, encoding, cmap, font dictionary
	    """Determine information about a font.

	    Args:
	        font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
	        space_width: fallback width of a space; replaced by the entry of
	            ``_default_fonts_space_width`` when the font's /BaseFont is
	            listed there.
	        obj: object whose /Resources dictionary holds the font.

	    Returns:
	        A tuple consisting of: font sub-type, space_width / 2, encoding,
	        character map (as built by ``parse_to_unicode``) and the font
	        dictionary itself (suitable for the curious).
	    """
	    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
	    font_type: str = cast(str, ft["/Subtype"])

	    space_code = 32
	    encoding, space_code = parse_encoding(ft, space_code)
	    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

	    # encoding is either a codec name (used to decode 1, 2 or a variable
	    # number of bytes) or a char table (1 byte codes only).
	    # An empty string means no /Encoding was usable and the codec has to be
	    # chosen from the code size recorded by the cmap in map_dict[-1].
	    if encoding == "":
	        if -1 not in map_dict or map_dict[-1] == 1:
	            # No rule found for fonts with neither /Encoding nor /ToUnicode;
	            # one example shows /Symbol,bold, so 8 bits is the default.
	            encoding = "charmap"
	        else:
	            encoding = "utf-16-be"
	    # PDF ref 1.7 §5.9.1, 1st bullet: if the cmap is not empty the encoding
	    # should be discarded (here: turned into identity for those characters).
	    # If encoding is a str it is expected to be an identity translation.
	    elif isinstance(encoding, dict):
	        for x in int_entry:
	            if x <= 255:
	                encoding[x] = chr(x)

	    try:
	        # override space_width with the standard-font value when known
	        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
	    except Exception:
	        # best effort: no /BaseFont or not a listed font, keep the caller's value
	        pass

	    # parse_to_unicode may hand back the space code as a cmap key (str);
	    # compute_space_width needs the integer character code.
	    if isinstance(space_code, str):
	        try:  # one byte
	            sp = space_code.encode("charmap")[0]
	        except Exception:
	            # Two-byte key: cmap keys are decoded as UTF-16-BE, so the first
	            # byte is the most significant one. (The former expression
	            # ``sp[0] + 256 * sp[1]`` swapped the bytes and looked up the
	            # wrong code in the width tables.)
	            encoded = space_code.encode("utf-16-be")
	            sp = 256 * encoded[0] + encoded[1]
	    else:
	        sp = space_code
	    sp_width = compute_space_width(ft, sp, space_width)

	    return (
	        font_type,
	        float(sp_width / 2),
	        encoding,
	        # https://github.com/python/mypy/issues/4374
	        map_dict,
	        ft,
	    )


	# used when missing data, e.g. font def missing
	# Fallback tuple (font type, space width, encoding table, cmap): every one
	# of the 256 single-byte codes is rendered as the replacement character.
	unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
	    "Unknown",
	    9999,
	    {code: "�" for code in range(256)},
	    {},
	)


	_predefined_cmap: Dict[str, str] = {
	"/Identity-H": "utf-16-be",
	"/Identity-V": "utf-16-be",
	"/GB-EUC-H": "gbk", # TBC
	"/GB-EUC-V": "gbk", # TBC
	"/GBpc-EUC-H": "gb2312", # TBC
	"/GBpc-EUC-V": "gb2312", # TBC
	}


	# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
	_default_fonts_space_width: Dict[str, int] = {
	"/Courrier": 600,
	"/Courier-Bold": 600,
	"/Courier-BoldOblique": 600,
	"/Courier-Oblique": 600,
	"/Helvetica": 278,
	"/Helvetica-Bold": 278,
	"/Helvetica-BoldOblique": 278,
	"/Helvetica-Oblique": 278,
	"/Helvetica-Narrow": 228,
	"/Helvetica-NarrowBold": 228,
	"/Helvetica-NarrowBoldOblique": 228,
	"/Helvetica-NarrowOblique": 228,
	"/Times-Roman": 250,
	"/Times-Bold": 250,
	"/Times-BoldItalic": 250,
	"/Times-Italic": 250,
	"/Symbol": 250,
	"/ZapfDingbats": 278,
	}


	def parse_encoding(
	    ft: DictionaryObject, space_code: int
	) -> Tuple[Union[str, Dict[int, str]], int]:
	    """Derive the character encoding of a font from its /Encoding entry.

	    Args:
	        ft: the font dictionary.
	        space_code: character code currently assumed to be the space.

	    Returns:
	        ``(encoding, space_code)`` where ``encoding`` is either a codec name
	        (``""`` when nothing usable was found) or a table mapping character
	        codes to characters, and ``space_code`` is updated when /Differences
	        assigns the space glyph to a code.
	    """
	    encoding: Union[str, List[str], Dict[int, str]] = []
	    if "/Encoding" not in ft:
	        try:
	            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
	                encoding = dict(
	                    zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
	                )
	            else:
	                encoding = "charmap"
	            # The lookup below raises for fonts that are not in the table,
	            # which is what routes them to the fallback in the handler.
	            # NOTE(review): the value returned in the space_code slot is a
	            # table *width*, not a character code - looks unintended, but the
	            # fallback depends on this lookup; confirm before changing.
	            return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
	        except Exception:
	            if cast(str, ft["/Subtype"]) == "/Type1":
	                return "charmap", space_code
	            else:
	                return "", space_code
	    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
	    if isinstance(enc, str):
	        # already done : enc = NameObject.unnumber(enc.encode()).decode()  # for #xx decoding
	        if enc in charset_encoding:
	            encoding = charset_encoding[enc].copy()
	        elif enc in _predefined_cmap:
	            encoding = _predefined_cmap[enc]
	        else:
	            warnings.warn(
	                f"Advanced encoding {enc} not implemented yet",
	                PdfReadWarning,
	            )
	            encoding = enc
	    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
	        try:
	            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
	        except Exception:
	            # Report the base encoding that could not be resolved (the
	            # message used to interpolate the still-empty ``encoding`` list).
	            warnings.warn(
	                f"Advanced encoding {enc.get('/BaseEncoding')} not implemented yet",
	                PdfReadWarning,
	            )
	            encoding = charset_encoding["/StandardCoding"].copy()
	    else:
	        encoding = charset_encoding["/StandardCoding"].copy()
	    if "/Differences" in enc:
	        # /Differences alternates integers (the next code to assign) with
	        # glyph names assigned to consecutive codes.
	        x: int = 0
	        o: Union[int, str]
	        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
	            if isinstance(o, int):
	                x = o
	                continue
	            try:
	                glyph = adobe_glyphs[o]
	            except (KeyError, TypeError):
	                # unknown glyph name: keep the name itself
	                glyph = o
	            # Ignore codes outside the table instead of failing with an
	            # IndexError raised from inside the exception handler.
	            if isinstance(encoding, list) and 0 <= x < len(encoding):
	                encoding[x] = glyph  # type: ignore
	                # Compare the resolved character: ``o`` is a glyph name and
	                # could never equal " " itself.
	                if glyph == " ":
	                    space_code = x
	            x += 1
	    if isinstance(encoding, list):
	        encoding = dict(zip(range(256), encoding))
	    return encoding, space_code


	def parse_to_unicode(
	    ft: DictionaryObject, space_code: int
	) -> Tuple[Dict[Any, Any], int, List[int]]:
	    """Build the character map described by the font's /ToUnicode entry.

	    Returns ``(map_dict, space_code, int_entry)``:

	    * ``map_dict`` holds every translation found; its ``-1`` key stores the
	      number of bytes per character code,
	    * ``space_code`` is replaced by the key that maps to ``" "`` when one
	      exists (the last such key in insertion order),
	    * ``int_entry`` lists the cmap keys as integers so the caller can
	      correct the encoding.

	    Without /ToUnicode the result is ``({}, space_code, [])``.
	    """
	    if "/ToUnicode" not in ft:
	        return {}, space_code, []

	    map_dict: Dict[Any, Any] = {}
	    int_entry: List[int] = []

	    # Parser state threaded through process_cm_line.
	    in_bfrange: bool = False
	    in_bfchar: bool = False
	    # (current_char, remaining size) while a bracketed range spans several
	    # lines ; cf #1285 for example of file
	    pending_range: Union[None, Tuple[int, int]] = None

	    for raw_line in prepare_cm(ft).split(b"\n"):
	        in_bfrange, in_bfchar, pending_range = process_cm_line(
	            raw_line.strip(b" "),
	            in_bfrange,
	            in_bfchar,
	            pending_range,
	            map_dict,
	            int_entry,
	        )

	    for key, target in map_dict.items():
	        if target == " ":
	            space_code = key
	    return map_dict, space_code, int_entry


	def prepare_cm(ft: DictionaryObject) -> bytes:
	    """Return the /ToUnicode CMap of ``ft`` normalised for line-based parsing.

	    Section keywords are isolated on their own lines, the content of every
	    ``<...>`` hex string is stripped of its brackets and inner spaces (an
	    empty ``<>`` becomes the placeholder ``.``), brackets are surrounded by
	    spaces and ``\\r`` is turned into ``\\n``.

	    A /ToUnicode entry that is neither a stream nor an ``/Identity*`` name
	    yields an empty CMap (it used to fail with UnboundLocalError).
	    """
	    tu = ft["/ToUnicode"]
	    cm: bytes = b""  # unsupported /ToUnicode kinds fall back to an empty CMap
	    if isinstance(tu, StreamObject):
	        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
	    elif isinstance(tu, str) and tu.startswith("/Identity"):
	        # Minimal synthetic range with 2-byte codes.
	        # Original note: "the full range 0000-FFFF will be processed".
	        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
	    if isinstance(cm, str):
	        cm = cm.encode()
	    # we need to prepare cm before due to missing return line in pdf printed to pdf from word
	    cm = (
	        cm.strip()
	        .replace(b"beginbfchar", b"\nbeginbfchar\n")
	        .replace(b"endbfchar", b"\nendbfchar\n")
	        .replace(b"beginbfrange", b"\nbeginbfrange\n")
	        .replace(b"endbfrange", b"\nendbfrange\n")
	        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
	        .replace(b">>", b"\n}\n")  # some solution to find it back
	    )
	    pieces = cm.split(b"<")
	    for i, piece in enumerate(pieces):
	        hex_part, closing, rest = piece.partition(b">")
	        if not closing:
	            continue  # no hex string terminated in this piece
	        if hex_part:
	            content = hex_part.replace(b" ", b"")
	        else:
	            # string is empty: stash a placeholder that parse_bfchar
	            # translates back to an empty string
	            # see https://github.com/py-pdf/PyPDF2/issues/1111
	            content = b"."
	        pieces[i] = content + b" " + rest
	    cm = (
	        (b" ".join(pieces))
	        .replace(b"[", b" [ ")
	        .replace(b"]", b" ]\n ")
	        .replace(b"\r", b"\n")
	    )
	    return cm


	def process_cm_line(
	    l: bytes,
	    process_rg: bool,
	    process_char: bool,
	    multiline_rg: Union[None, Tuple[int, int]],
	    map_dict: Dict[Any, Any],
	    int_entry: List[int],
	) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
	    """Consume one prepared CMap line and return the updated parser state.

	    ``process_rg`` / ``process_char`` tell whether the line lies inside a
	    bfrange / bfchar section; ``multiline_rg`` carries an unfinished
	    bracketed range. Section keywords toggle the flags, every other line is
	    handed to ``parse_bfrange`` or ``parse_bfchar`` according to the active
	    section, which fill ``map_dict`` and ``int_entry`` in place.
	    """
	    is_blank = l in (b"", b" ")
	    if not (is_blank or l.startswith(b"%")):  # "%" opens a comment line
	        if b"beginbfrange" in l:
	            process_rg = True
	        elif b"endbfrange" in l:
	            process_rg = False
	        elif b"beginbfchar" in l:
	            process_char = True
	        elif b"endbfchar" in l:
	            process_char = False
	        elif process_rg:
	            multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
	        elif process_char:
	            parse_bfchar(l, map_dict, int_entry)
	    return process_rg, process_char, multiline_rg


	def parse_bfrange(
	    l: bytes,
	    map_dict: Dict[Any, Any],
	    int_entry: List[int],
	    multiline_rg: Union[None, Tuple[int, int]],
	) -> Union[None, Tuple[int, int]]:
	    """Parse one prepared line of a ``bfrange`` section into ``map_dict``.

	    Args:
	        l: space-separated tokens; hex strings come without ``<>``.
	        map_dict: receives ``source char -> unicode string``; its ``-1`` key
	            stores the number of bytes per source code.
	        int_entry: receives every mapped source code as an integer.
	        multiline_rg: ``None`` for a fresh range, otherwise ``(next code,
	            last code)`` of a bracketed range opened on an earlier line.

	    Returns:
	        ``None`` when the range is complete, else ``(next code, last code)``
	        to be passed back with the following line.
	    """
	    lst = [x for x in l.split(b" ") if x]
	    closure_found = False

	    def store(code: int, target: str) -> None:
	        # Key is the source code rendered on the recorded number of bytes.
	        map_dict[
	            unhexlify(fmt % code).decode(
	                "charmap" if map_dict[-1] == 1 else "utf-16-be",
	                "surrogatepass",
	            )
	        ] = target
	        int_entry.append(code)

	    if multiline_rg is not None:
	        # Continuation of a bracketed list: the line holds targets only, so
	        # the code width recorded by the opening line must be kept and the
	        # first token is already a target (it used to be skipped, and a lone
	        # "]" raised IndexError).
	        fmt = b"%%0%dX" % (map_dict[-1] * 2)
	        a = multiline_rg[0]  # a, b not in the current line
	        b = multiline_rg[1]
	        for sq in lst:
	            if sq == b"]":
	                closure_found = True
	                break
	            store(a, unhexlify(sq).decode("utf-16-be", "surrogatepass"))
	            a += 1
	    else:
	        a = int(lst[0], 16)
	        b = int(lst[1], 16)
	        nbi = max(len(lst[0]), len(lst[1]))
	        map_dict[-1] = ceil(nbi / 2)
	        fmt = b"%%0%dX" % (map_dict[-1] * 2)
	        if lst[2] == b"[":
	            # explicit list of targets, possibly continued on later lines
	            for sq in lst[3:]:
	                if sq == b"]":
	                    closure_found = True
	                    break
	                store(a, unhexlify(sq).decode("utf-16-be", "surrogatepass"))
	                a += 1
	        else:  # case without list: consecutive targets starting at lst[2]
	            c = int(lst[2], 16)
	            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
	            closure_found = True
	            while a <= b:
	                store(a, unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass"))
	                a += 1
	                c += 1
	    return None if closure_found else (a, b)


	def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
	    """Parse one prepared line of a ``bfchar`` section into ``map_dict``.

	    The line is a sequence of ``source target`` hex pairs. The byte size of
	    the first source code is recorded in ``map_dict[-1]`` and selects the
	    codec for every source on the line; each source code is also appended
	    to ``int_entry`` as an integer. A trailing unpaired token is ignored.
	    """
	    tokens = [tok for tok in l.split(b" ") if tok]
	    code_size = len(tokens[0]) // 2
	    map_dict[-1] = code_size
	    source_codec = "charmap" if code_size == 1 else "utf-16-be"

	    for source, target in zip(tokens[0::2], tokens[1::2]):
	        if target == b".":
	            # placeholder written by prepare_cm for an empty string
	            mapped = ""
	        else:
	            # targets shorter than 4 hex digits are single-byte values
	            target_codec = "charmap" if len(target) < 4 else "utf-16-be"
	            mapped = unhexlify(target).decode(target_codec, "surrogatepass")
	        source_key = unhexlify(source).decode(source_codec, "surrogatepass")
	        map_dict[source_key] = mapped
	        int_entry.append(int(source, 16))


	def compute_space_width(
	ft: DictionaryObject, space_code: int, space_width: float
	) -> float:
	sp_width: float = space_width * 2 # default value
	w = []
	w1 = {}
	st: int = 0
	if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
	ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
	try:
	w1[-1] = cast(float, ft1["/DW"])
	except Exception:
	w1[-1] = 1000.0
	if "/W" in ft1:
	w = list(ft1["/W"])
	else:
	w = []
	while len(w) > 0:
	st = w[0]
	second = w[1]
	if isinstance(second, int):
	for x in range(st, second):
	w1[x] = w[2]
	w = w[3:]
	elif isinstance(second, list):
	for y in second:
	w1[st] = y
	st += 1
	w = w[2:]
	else:
	logger_warning(
	"unknown widths : \n" + (ft1["/W"]).__repr__(),
	__name__,
	)
	break
	try:
	sp_width = w1[space_code]
	except Exception:
	sp_width = (
	w1[-1] / 2.0
	) # if using default we consider space will be only half size
	elif "/Widths" in ft:
	w = list(ft["/Widths"]) # type: ignore
	try:
	st = cast(int, ft["/FirstChar"])
	en: int = cast(int, ft["/LastChar"])
	if st > space_code or en < space_code:
	raise Exception("Not in range")
	if w[space_code - st] == 0:
	raise Exception("null width")
	sp_width = w[space_code - st]
	except Exception:
	if "/FontDescriptor" in ft and "/MissingWidth" in cast(
	DictionaryObject, ft["/FontDescriptor"]
	):
	sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore
	else:
	# will consider width of char as avg(width)/2
	m = 0
	cpt = 0
	for x in w:
	if x > 0:
	m += x
	cpt += 1
	sp_width = m / max(1, cpt) / 2
	return sp_width