Spaces:
Paused
Paused
| # Copyright (c) 2006, Mathieu Fenniak | |
| # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> | |
| # | |
| # All rights reserved. | |
| # | |
| # Redistribution and use in source and binary forms, with or without | |
| # modification, are permitted provided that the following conditions are | |
| # met: | |
| # | |
| # * Redistributions of source code must retain the above copyright notice, | |
| # this list of conditions and the following disclaimer. | |
| # * Redistributions in binary form must reproduce the above copyright notice, | |
| # this list of conditions and the following disclaimer in the documentation | |
| # and/or other materials provided with the distribution. | |
| # * The name of the author may not be used to endorse or promote products | |
| # derived from this software without specific prior written permission. | |
| # | |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| # POSSIBILITY OF SUCH DAMAGE. | |
| import math | |
| import uuid | |
| import warnings | |
| from decimal import Decimal | |
| from typing import ( | |
| Any, | |
| Callable, | |
| Dict, | |
| Iterable, | |
| Iterator, | |
| List, | |
| Optional, | |
| Set, | |
| Tuple, | |
| Union, | |
| cast, | |
| ) | |
| from ._cmap import build_char_map, unknown_char_map | |
| from ._protocols import PdfReaderProtocol | |
| from ._utils import ( | |
| CompressedTransformationMatrix, | |
| File, | |
| TransformationMatrixType, | |
| deprecation_no_replacement, | |
| deprecation_with_replacement, | |
| logger_warning, | |
| matrix_multiply, | |
| ) | |
| from .constants import AnnotationDictionaryAttributes as ADA | |
| from .constants import ImageAttributes as IA | |
| from .constants import PageAttributes as PG | |
| from .constants import Ressources as RES | |
| from .errors import PageSizeNotDefinedError | |
| from .filters import _xobj_to_image | |
| from .generic import ( | |
| ArrayObject, | |
| ContentStream, | |
| DictionaryObject, | |
| EncodedStreamObject, | |
| FloatObject, | |
| IndirectObject, | |
| NameObject, | |
| NullObject, | |
| NumberObject, | |
| RectangleObject, | |
| encode_pdfdocencoding, | |
| ) | |
| CUSTOM_RTL_MIN: int = -1 | |
| CUSTOM_RTL_MAX: int = -1 | |
| CUSTOM_RTL_SPECIAL_CHARS: List[int] = [] | |
| def set_custom_rtl( | |
| _min: Union[str, int, None] = None, | |
| _max: Union[str, int, None] = None, | |
| specials: Union[str, List[int], None] = None, | |
| ) -> Tuple[int, int, List[int]]: | |
| """ | |
| Change the Right-To-Left and special characters custom parameters. | |
| Args: | |
| _min: The new minimum value for the range of custom characters that | |
| will be written right to left. | |
| If set to `None`, the value will not be changed. | |
| If set to an integer or string, it will be converted to its ASCII code. | |
| The default value is -1, which sets no additional range to be converted. | |
| _max: The new maximum value for the range of custom characters that will be written right to left. | |
| If set to `None`, the value will not be changed. | |
| If set to an integer or string, it will be converted to its ASCII code. | |
| The default value is -1, which sets no additional range to be converted. | |
| specials: The new list of special characters to be inserted in the current insertion order. | |
| If set to `None`, the current value will not be changed. | |
| If set to a string, it will be converted to a list of ASCII codes. | |
| The default value is an empty list. | |
| Returns: | |
| A tuple containing the new values for `CUSTOM_RTL_MIN`, `CUSTOM_RTL_MAX`, and `CUSTOM_RTL_SPECIAL_CHARS`. | |
| """ | |
| global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS | |
| if isinstance(_min, int): | |
| CUSTOM_RTL_MIN = _min | |
| elif isinstance(_min, str): | |
| CUSTOM_RTL_MIN = ord(_min) | |
| if isinstance(_max, int): | |
| CUSTOM_RTL_MAX = _max | |
| elif isinstance(_max, str): | |
| CUSTOM_RTL_MAX = ord(_max) | |
| if isinstance(specials, str): | |
| CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials] | |
| elif isinstance(specials, list): | |
| CUSTOM_RTL_SPECIAL_CHARS = specials | |
| return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS | |
| def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: | |
| retval: Union[None, RectangleObject, IndirectObject] = self.get(name) | |
| if isinstance(retval, RectangleObject): | |
| return retval | |
| if retval is None: | |
| for d in defaults: | |
| retval = self.get(d) | |
| if retval is not None: | |
| break | |
| if isinstance(retval, IndirectObject): | |
| retval = self.pdf.get_object(retval) | |
| retval = RectangleObject(retval) # type: ignore | |
| _set_rectangle(self, name, retval) | |
| return retval | |
| def getRectangle( | |
| self: Any, name: str, defaults: Iterable[str] | |
| ) -> RectangleObject: # pragma: no cover | |
| deprecation_no_replacement("getRectangle", "3.0.0") | |
| return _get_rectangle(self, name, defaults) | |
| def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: | |
| name = NameObject(name) | |
| self[name] = value | |
| def setRectangle( | |
| self: Any, name: str, value: Union[RectangleObject, float] | |
| ) -> None: # pragma: no cover | |
| deprecation_no_replacement("setRectangle", "3.0.0") | |
| _set_rectangle(self, name, value) | |
| def _delete_rectangle(self: Any, name: str) -> None: | |
| del self[name] | |
| def deleteRectangle(self: Any, name: str) -> None: # pragma: no cover | |
| deprecation_no_replacement("deleteRectangle", "3.0.0") | |
| del self[name] | |
| def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: | |
| return property( | |
| lambda self: _get_rectangle(self, name, fallback), | |
| lambda self, value: _set_rectangle(self, name, value), | |
| lambda self: _delete_rectangle(self, name), | |
| ) | |
| def createRectangleAccessor( | |
| name: str, fallback: Iterable[str] | |
| ) -> property: # pragma: no cover | |
| deprecation_no_replacement("createRectangleAccessor", "3.0.0") | |
| return _create_rectangle_accessor(name, fallback) | |
| class Transformation: | |
| """ | |
| Represent a 2D transformation. | |
| The transformation between two coordinate systems is represented by a 3-by-3 | |
| transformation matrix matrix with the following form:: | |
| a b 0 | |
| c d 0 | |
| e f 1 | |
| Because a transformation matrix has only six elements that can be changed, | |
| it is usually specified in PDF as the six-element array [ a b c d e f ]. | |
| Coordinate transformations are expressed as matrix multiplications:: | |
| a b 0 | |
| [ x′ y′ 1 ] = [ x y 1 ] × c d 0 | |
| e f 1 | |
| Example | |
| ------- | |
| >>> from PyPDF2 import Transformation | |
| >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20) | |
| >>> page.add_transformation(op) | |
| """ | |
| # 9.5.4 Coordinate Systems for 3D | |
| # 4.2.2 Common Transformations | |
| def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)): | |
| self.ctm = ctm | |
| def matrix(self) -> TransformationMatrixType: | |
| """ | |
| Return the transformation matrix as a tuple of tuples in the form: | |
| ((a, b, 0), (c, d, 0), (e, f, 1)) | |
| """ | |
| return ( | |
| (self.ctm[0], self.ctm[1], 0), | |
| (self.ctm[2], self.ctm[3], 0), | |
| (self.ctm[4], self.ctm[5], 1), | |
| ) | |
| def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix: | |
| """ | |
| Compresses the transformation matrix into a tuple of (a, b, c, d, e, f). | |
| Args: | |
| matrix: The transformation matrix as a tuple of tuples. | |
| Returns: | |
| A tuple representing the transformation matrix as (a, b, c, d, e, f) | |
| """ | |
| return ( | |
| matrix[0][0], | |
| matrix[0][1], | |
| matrix[1][0], | |
| matrix[1][1], | |
| matrix[2][0], | |
| matrix[2][1], | |
| ) | |
| def translate(self, tx: float = 0, ty: float = 0) -> "Transformation": | |
| """ | |
| Translate the contents of a page. | |
| Args: | |
| tx: The translation along the x-axis. | |
| ty: The translation along the y-axis. | |
| Returns: | |
| A new `Transformation` instance | |
| """ | |
| m = self.ctm | |
| return Transformation(ctm=(m[0], m[1], m[2], m[3], m[4] + tx, m[5] + ty)) | |
| def scale( | |
| self, sx: Optional[float] = None, sy: Optional[float] = None | |
| ) -> "Transformation": | |
| """ | |
| Scale the contents of a page towards the origin of the coordinate system. | |
| Typically, that is the lower-left corner of the page. That can be | |
| changed by translating the contents / the page boxes. | |
| Args: | |
| sx: The scale factor along the x-axis. | |
| sy: The scale factor along the y-axis. | |
| Returns: | |
| A new Transformation instance with the scaled matrix. | |
| """ | |
| if sx is None and sy is None: | |
| raise ValueError("Either sx or sy must be specified") | |
| if sx is None: | |
| sx = sy | |
| if sy is None: | |
| sy = sx | |
| assert sx is not None | |
| assert sy is not None | |
| op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1)) | |
| ctm = Transformation.compress(matrix_multiply(self.matrix, op)) | |
| return Transformation(ctm) | |
| def rotate(self, rotation: float) -> "Transformation": | |
| """ | |
| Rotate the contents of a page. | |
| Args: | |
| rotation: The angle of rotation in degrees. | |
| Returns: | |
| A new `Transformation` instance with the rotated matrix. | |
| """ | |
| rotation = math.radians(rotation) | |
| op: TransformationMatrixType = ( | |
| (math.cos(rotation), math.sin(rotation), 0), | |
| (-math.sin(rotation), math.cos(rotation), 0), | |
| (0, 0, 1), | |
| ) | |
| ctm = Transformation.compress(matrix_multiply(self.matrix, op)) | |
| return Transformation(ctm) | |
| def __repr__(self) -> str: | |
| return f"Transformation(ctm={self.ctm})" | |
| def apply_on( | |
| self, pt: Union[Tuple[Decimal, Decimal], Tuple[float, float], List[float]] | |
| ) -> Union[Tuple[float, float], List[float]]: | |
| """ | |
| Apply the transformation matrix on the given point. | |
| Args: | |
| pt: A tuple or list representing the point in the form (x, y) | |
| Returns: | |
| A tuple or list representing the transformed point in the form (x', y') | |
| """ | |
| pt1 = ( | |
| float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4], | |
| float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5], | |
| ) | |
| return list(pt1) if isinstance(pt, list) else pt1 | |
| class PageObject(DictionaryObject): | |
| """ | |
| PageObject represents a single page within a PDF file. | |
| Typically this object will be created by accessing the | |
| :meth:`get_page()<PyPDF2.PdfReader.get_page>` method of the | |
| :class:`PdfReader<PyPDF2.PdfReader>` class, but it is | |
| also possible to create an empty page with the | |
| :meth:`create_blank_page()<PyPDF2._page.PageObject.create_blank_page>` static method. | |
| Args: | |
| pdf: PDF file the page belongs to. | |
| indirect_reference: Stores the original indirect reference to | |
| this object in its source PDF | |
| """ | |
| original_page: "PageObject" # very local use in writer when appending | |
| def __init__( | |
| self, | |
| pdf: Optional[PdfReaderProtocol] = None, | |
| indirect_reference: Optional[IndirectObject] = None, | |
| indirect_ref: Optional[IndirectObject] = None, # deprecated | |
| ) -> None: | |
| DictionaryObject.__init__(self) | |
| self.pdf: Optional[PdfReaderProtocol] = pdf | |
| if indirect_ref is not None: # deprecated | |
| warnings.warn( | |
| ( | |
| "indirect_ref is deprecated and will be removed in " | |
| "PyPDF2 4.0.0. Use indirect_reference instead of indirect_ref." | |
| ), | |
| DeprecationWarning, | |
| ) | |
| if indirect_reference is not None: | |
| raise ValueError("Use indirect_reference instead of indirect_ref.") | |
| indirect_reference = indirect_ref | |
| self.indirect_reference = indirect_reference | |
| def indirect_ref(self) -> Optional[IndirectObject]: # deprecated | |
| warnings.warn( | |
| ( | |
| "indirect_ref is deprecated and will be removed in PyPDF2 4.0.0" | |
| "Use indirect_reference instead of indirect_ref." | |
| ), | |
| DeprecationWarning, | |
| ) | |
| return self.indirect_reference | |
| def indirect_ref(self, value: Optional[IndirectObject]) -> None: # deprecated | |
| self.indirect_reference = value | |
| def hash_value_data(self) -> bytes: | |
| data = super().hash_value_data() | |
| data += b"%d" % id(self) | |
| return data | |
| def user_unit(self) -> float: | |
| """ | |
| A read-only positive number giving the size of user space units. | |
| It is in multiples of 1/72 inch. Hence a value of 1 means a user space | |
| unit is 1/72 inch, and a value of 3 means that a user space unit is | |
| 3/72 inch. | |
| """ | |
| return self.get(PG.USER_UNIT, 1) | |
| def create_blank_page( | |
| pdf: Optional[Any] = None, # PdfReader | |
| width: Union[float, Decimal, None] = None, | |
| height: Union[float, Decimal, None] = None, | |
| ) -> "PageObject": | |
| """ | |
| Return a new blank page. | |
| If ``width`` or ``height`` is ``None``, try to get the page size | |
| from the last page of *pdf*. | |
| Args: | |
| pdf: PDF file the page belongs to | |
| width: The width of the new page expressed in default user | |
| space units. | |
| height: The height of the new page expressed in default user | |
| space units. | |
| Returns: | |
| The new blank page | |
| Raises: | |
| PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains | |
| no page | |
| """ | |
| page = PageObject(pdf) | |
| # Creates a new page (cf PDF Reference 7.7.3.3) | |
| page.__setitem__(NameObject(PG.TYPE), NameObject("/Page")) | |
| page.__setitem__(NameObject(PG.PARENT), NullObject()) | |
| page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) | |
| if width is None or height is None: | |
| if pdf is not None and len(pdf.pages) > 0: | |
| lastpage = pdf.pages[len(pdf.pages) - 1] | |
| width = lastpage.mediabox.width | |
| height = lastpage.mediabox.height | |
| else: | |
| raise PageSizeNotDefinedError | |
| page.__setitem__( | |
| NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore | |
| ) | |
| return page | |
| def createBlankPage( | |
| pdf: Optional[Any] = None, # PdfReader | |
| width: Union[float, Decimal, None] = None, | |
| height: Union[float, Decimal, None] = None, | |
| ) -> "PageObject": # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`create_blank_page` instead. | |
| """ | |
| deprecation_with_replacement("createBlankPage", "create_blank_page", "3.0.0") | |
| return PageObject.create_blank_page(pdf, width, height) | |
| def images(self) -> List[File]: | |
| """ | |
| Get a list of all images of the page. | |
| This requires pillow. You can install it via 'pip install PyPDF2[image]'. | |
| For the moment, this does NOT include inline images. They will be added | |
| in future. | |
| """ | |
| images_extracted: List[File] = [] | |
| if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore | |
| return images_extracted | |
| x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore | |
| for obj in x_object: | |
| if x_object[obj][IA.SUBTYPE] == "/Image": | |
| extension, byte_stream = _xobj_to_image(x_object[obj]) | |
| if extension is not None: | |
| filename = f"{obj[1:]}{extension}" | |
| images_extracted.append(File(name=filename, data=byte_stream)) | |
| return images_extracted | |
| def rotation(self) -> int: | |
| """ | |
| The VISUAL rotation of the page. | |
| This number has to be a multiple of 90 degrees: 0,90,180,270 | |
| This property does not affect "/Contents" | |
| """ | |
| return int(self.get(PG.ROTATE, 0)) | |
| def rotation(self, r: Union[int, float]) -> None: | |
| self[NameObject(PG.ROTATE)] = NumberObject((((int(r) + 45) // 90) * 90) % 360) | |
| def transfer_rotation_to_content(self) -> None: | |
| """ | |
| Apply the rotation of the page to the content and the media/crop/... boxes. | |
| It's recommended to apply this function before page merging. | |
| """ | |
| r = -self.rotation # rotation to apply is in the otherway | |
| self.rotation = 0 | |
| mb = RectangleObject(self.mediabox) | |
| trsf = ( | |
| Transformation() | |
| .translate( | |
| -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2) | |
| ) | |
| .rotate(r) | |
| ) | |
| pt1 = trsf.apply_on(mb.lower_left) | |
| pt2 = trsf.apply_on(mb.upper_right) | |
| trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1])) | |
| self.add_transformation(trsf, False) | |
| for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]: | |
| if b in self: | |
| rr = RectangleObject(self[b]) # type: ignore | |
| pt1 = trsf.apply_on(rr.lower_left) | |
| pt2 = trsf.apply_on(rr.upper_right) | |
| self[NameObject(b)] = RectangleObject( | |
| ( | |
| min(pt1[0], pt2[0]), | |
| min(pt1[1], pt2[1]), | |
| max(pt1[0], pt2[0]), | |
| max(pt1[1], pt2[1]), | |
| ) | |
| ) | |
| def rotate(self, angle: int) -> "PageObject": | |
| """ | |
| Rotate a page clockwise by increments of 90 degrees. | |
| Args: | |
| angle: Angle to rotate the page. Must be an increment of 90 deg. | |
| """ | |
| if angle % 90 != 0: | |
| raise ValueError("Rotation angle must be a multiple of 90") | |
| rotate_obj = self.get(PG.ROTATE, 0) | |
| current_angle = ( | |
| rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() | |
| ) | |
| self[NameObject(PG.ROTATE)] = NumberObject(current_angle + angle) | |
| return self | |
| def rotate_clockwise(self, angle: int) -> "PageObject": # pragma: no cover | |
| deprecation_with_replacement("rotate_clockwise", "rotate", "3.0.0") | |
| return self.rotate(angle) | |
| def rotateClockwise(self, angle: int) -> "PageObject": # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`rotate_clockwise` instead. | |
| """ | |
| deprecation_with_replacement("rotateClockwise", "rotate", "3.0.0") | |
| return self.rotate(angle) | |
| def rotateCounterClockwise(self, angle: int) -> "PageObject": # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`rotate_clockwise` with a negative argument instead. | |
| """ | |
| deprecation_with_replacement("rotateCounterClockwise", "rotate", "3.0.0") | |
| return self.rotate(-angle) | |
| def _merge_resources( | |
| res1: DictionaryObject, res2: DictionaryObject, resource: Any | |
| ) -> Tuple[Dict[str, Any], Dict[str, Any]]: | |
| new_res = DictionaryObject() | |
| new_res.update(res1.get(resource, DictionaryObject()).get_object()) | |
| page2res = cast( | |
| DictionaryObject, res2.get(resource, DictionaryObject()).get_object() | |
| ) | |
| rename_res = {} | |
| for key in list(page2res.keys()): | |
| if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): | |
| newname = NameObject(key + str(uuid.uuid4())) | |
| rename_res[key] = newname | |
| new_res[newname] = page2res[key] | |
| elif key not in new_res: | |
| new_res[key] = page2res.raw_get(key) | |
| return new_res, rename_res | |
| def _content_stream_rename( | |
| stream: ContentStream, rename: Dict[Any, Any], pdf: Any # PdfReader | |
| ) -> ContentStream: | |
| if not rename: | |
| return stream | |
| stream = ContentStream(stream, pdf) | |
| for operands, _operator in stream.operations: | |
| if isinstance(operands, list): | |
| for i in range(len(operands)): | |
| op = operands[i] | |
| if isinstance(op, NameObject): | |
| operands[i] = rename.get(op, op) | |
| elif isinstance(operands, dict): | |
| for i in operands: | |
| op = operands[i] | |
| if isinstance(op, NameObject): | |
| operands[i] = rename.get(op, op) | |
| else: | |
| raise KeyError(f"type of operands is {type(operands)}") | |
| return stream | |
| def _push_pop_gs(contents: Any, pdf: Any) -> ContentStream: # PdfReader | |
| # adds a graphics state "push" and "pop" to the beginning and end | |
| # of a content stream. This isolates it from changes such as | |
| # transformation matricies. | |
| stream = ContentStream(contents, pdf) | |
| stream.operations.insert(0, ([], "q")) | |
| stream.operations.append(([], "Q")) | |
| return stream | |
| def _add_transformation_matrix( | |
| contents: Any, pdf: Any, ctm: CompressedTransformationMatrix | |
| ) -> ContentStream: # PdfReader | |
| # adds transformation matrix at the beginning of the given | |
| # contents stream. | |
| a, b, c, d, e, f = ctm | |
| contents = ContentStream(contents, pdf) | |
| contents.operations.insert( | |
| 0, | |
| [ | |
| [ | |
| FloatObject(a), | |
| FloatObject(b), | |
| FloatObject(c), | |
| FloatObject(d), | |
| FloatObject(e), | |
| FloatObject(f), | |
| ], | |
| " cm", | |
| ], | |
| ) | |
| return contents | |
| def get_contents(self) -> Optional[ContentStream]: | |
| """ | |
| Access the page contents. | |
| :return: the ``/Contents`` object, or ``None`` if it doesn't exist. | |
| ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 | |
| """ | |
| if PG.CONTENTS in self: | |
| return self[PG.CONTENTS].get_object() # type: ignore | |
| else: | |
| return None | |
| def getContents(self) -> Optional[ContentStream]: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`get_contents` instead. | |
| """ | |
| deprecation_with_replacement("getContents", "get_contents", "3.0.0") | |
| return self.get_contents() | |
| def merge_page(self, page2: "PageObject", expand: bool = False) -> None: | |
| """ | |
| Merge the content streams of two pages into one. | |
| Resource references | |
| (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc | |
| of this page are not altered. The parameter page's content stream will | |
| be added to the end of this page's content stream, meaning that it will | |
| be drawn after, or "on top" of this page. | |
| Args: | |
| page2: The page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| expand: If true, the current page dimensions will be | |
| expanded to accommodate the dimensions of the page to be merged. | |
| """ | |
| self._merge_page(page2, expand=expand) | |
| def mergePage(self, page2: "PageObject") -> None: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement("mergePage", "merge_page", "3.0.0") | |
| return self.merge_page(page2) | |
| def _merge_page( | |
| self, | |
| page2: "PageObject", | |
| page2transformation: Optional[Callable[[Any], ContentStream]] = None, | |
| ctm: Optional[CompressedTransformationMatrix] = None, | |
| expand: bool = False, | |
| ) -> None: | |
| # First we work on merging the resource dictionaries. This allows us | |
| # to find out what symbols in the content streams we might need to | |
| # rename. | |
| new_resources = DictionaryObject() | |
| rename = {} | |
| try: | |
| original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) | |
| except KeyError: | |
| original_resources = DictionaryObject() | |
| try: | |
| page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) | |
| except KeyError: | |
| page2resources = DictionaryObject() | |
| new_annots = ArrayObject() | |
| for page in (self, page2): | |
| if PG.ANNOTS in page: | |
| annots = page[PG.ANNOTS] | |
| if isinstance(annots, ArrayObject): | |
| for ref in annots: | |
| new_annots.append(ref) | |
| for res in ( | |
| RES.EXT_G_STATE, | |
| RES.FONT, | |
| RES.XOBJECT, | |
| RES.COLOR_SPACE, | |
| RES.PATTERN, | |
| RES.SHADING, | |
| RES.PROPERTIES, | |
| ): | |
| new, newrename = PageObject._merge_resources( | |
| original_resources, page2resources, res | |
| ) | |
| if new: | |
| new_resources[NameObject(res)] = new | |
| rename.update(newrename) | |
| # Combine /ProcSet sets. | |
| new_resources[NameObject(RES.PROC_SET)] = ArrayObject( | |
| frozenset( | |
| original_resources.get(RES.PROC_SET, ArrayObject()).get_object() | |
| ).union( | |
| frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) | |
| ) | |
| ) | |
| new_content_array = ArrayObject() | |
| original_content = self.get_contents() | |
| if original_content is not None: | |
| new_content_array.append( | |
| PageObject._push_pop_gs(original_content, self.pdf) | |
| ) | |
| page2content = page2.get_contents() | |
| if page2content is not None: | |
| page2content = ContentStream(page2content, self.pdf) | |
| rect = page2.trimbox | |
| page2content.operations.insert( | |
| 0, | |
| ( | |
| map( | |
| FloatObject, | |
| [ | |
| rect.left, | |
| rect.bottom, | |
| rect.width, | |
| rect.height, | |
| ], | |
| ), | |
| "re", | |
| ), | |
| ) | |
| page2content.operations.insert(1, ([], "W")) | |
| page2content.operations.insert(2, ([], "n")) | |
| if page2transformation is not None: | |
| page2content = page2transformation(page2content) | |
| page2content = PageObject._content_stream_rename( | |
| page2content, rename, self.pdf | |
| ) | |
| page2content = PageObject._push_pop_gs(page2content, self.pdf) | |
| new_content_array.append(page2content) | |
| # if expanding the page to fit a new page, calculate the new media box size | |
| if expand: | |
| self._expand_mediabox(page2, ctm) | |
| self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf) | |
| self[NameObject(PG.RESOURCES)] = new_resources | |
| self[NameObject(PG.ANNOTS)] = new_annots | |
| def _expand_mediabox( | |
| self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] | |
| ) -> None: | |
| corners1 = ( | |
| self.mediabox.left.as_numeric(), | |
| self.mediabox.bottom.as_numeric(), | |
| self.mediabox.right.as_numeric(), | |
| self.mediabox.top.as_numeric(), | |
| ) | |
| corners2 = ( | |
| page2.mediabox.left.as_numeric(), | |
| page2.mediabox.bottom.as_numeric(), | |
| page2.mediabox.left.as_numeric(), | |
| page2.mediabox.top.as_numeric(), | |
| page2.mediabox.right.as_numeric(), | |
| page2.mediabox.top.as_numeric(), | |
| page2.mediabox.right.as_numeric(), | |
| page2.mediabox.bottom.as_numeric(), | |
| ) | |
| if ctm is not None: | |
| ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] | |
| new_x = tuple( | |
| ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] | |
| for i in range(0, 8, 2) | |
| ) | |
| new_y = tuple( | |
| ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] | |
| for i in range(0, 8, 2) | |
| ) | |
| else: | |
| new_x = corners2[0:8:2] | |
| new_y = corners2[1:8:2] | |
| lowerleft = (min(new_x), min(new_y)) | |
| upperright = (max(new_x), max(new_y)) | |
| lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) | |
| upperright = ( | |
| max(corners1[2], upperright[0]), | |
| max(corners1[3], upperright[1]), | |
| ) | |
| self.mediabox.lower_left = lowerleft | |
| self.mediabox.upper_right = upperright | |
| def mergeTransformedPage( | |
| self, | |
| page2: "PageObject", | |
| ctm: Union[CompressedTransformationMatrix, Transformation], | |
| expand: bool = False, | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeTransformedPage is similar to merge_page, but a transformation | |
| matrix is applied to the merged stream. | |
| :param PageObject page2: The page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param tuple ctm: a 6-element tuple containing the operands of the | |
| transformation matrix | |
| :param bool expand: Whether the page should be expanded to fit the dimensions | |
| of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeTransformedPage(page2, ctm)", | |
| "page2.add_transformation(ctm); page.merge_page(page2)", | |
| "3.0.0", | |
| ) | |
| if isinstance(ctm, Transformation): | |
| ctm = ctm.ctm | |
| ctm = cast(CompressedTransformationMatrix, ctm) | |
| self._merge_page( | |
| page2, | |
| lambda page2Content: PageObject._add_transformation_matrix( | |
| page2Content, page2.pdf, ctm # type: ignore[arg-type] | |
| ), | |
| ctm, | |
| expand, | |
| ) | |
| def mergeScaledPage( | |
| self, page2: "PageObject", scale: float, expand: bool = False | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeScaledPage is similar to merge_page, but the stream to be merged | |
| is scaled by applying a transformation matrix. | |
| :param PageObject page2: The page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float scale: The scaling factor | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeScaledPage(page2, scale, expand)", | |
| "page2.add_transformation(Transformation().scale(scale)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().scale(scale, scale) | |
| self.mergeTransformedPage(page2, op, expand) | |
| def mergeRotatedPage( | |
| self, page2: "PageObject", rotation: float, expand: bool = False | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeRotatedPage is similar to merge_page, but the stream to be merged | |
| is rotated by applying a transformation matrix. | |
| :param PageObject page2: the page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float rotation: The angle of the rotation, in degrees | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeRotatedPage(page2, rotation, expand)", | |
| "page2.add_transformation(Transformation().rotate(rotation)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().rotate(rotation) | |
| self.mergeTransformedPage(page2, op, expand) | |
| def mergeTranslatedPage( | |
| self, page2: "PageObject", tx: float, ty: float, expand: bool = False | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeTranslatedPage is similar to merge_page, but the stream to be | |
| merged is translated by applying a transformation matrix. | |
| :param PageObject page2: the page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float tx: The translation on X axis | |
| :param float ty: The translation on Y axis | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeTranslatedPage(page2, tx, ty, expand)", | |
| "page2.add_transformation(Transformation().translate(tx, ty)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().translate(tx, ty) | |
| self.mergeTransformedPage(page2, op, expand) | |
| def mergeRotatedTranslatedPage( | |
| self, | |
| page2: "PageObject", | |
| rotation: float, | |
| tx: float, | |
| ty: float, | |
| expand: bool = False, | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeRotatedTranslatedPage is similar to merge_page, but the stream to | |
| be merged is rotated and translated by applying a transformation matrix. | |
| :param PageObject page2: the page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float tx: The translation on X axis | |
| :param float ty: The translation on Y axis | |
| :param float rotation: The angle of the rotation, in degrees | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeRotatedTranslatedPage(page2, rotation, tx, ty, expand)", | |
| "page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().translate(-tx, -ty).rotate(rotation).translate(tx, ty) | |
| return self.mergeTransformedPage(page2, op, expand) | |
| def mergeRotatedScaledPage( | |
| self, page2: "PageObject", rotation: float, scale: float, expand: bool = False | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeRotatedScaledPage is similar to merge_page, but the stream to be | |
| merged is rotated and scaled by applying a transformation matrix. | |
| :param PageObject page2: the page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float rotation: The angle of the rotation, in degrees | |
| :param float scale: The scaling factor | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeRotatedScaledPage(page2, rotation, scale, expand)", | |
| "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().rotate(rotation).scale(scale, scale) | |
| self.mergeTransformedPage(page2, op, expand) | |
| def mergeScaledTranslatedPage( | |
| self, | |
| page2: "PageObject", | |
| scale: float, | |
| tx: float, | |
| ty: float, | |
| expand: bool = False, | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeScaledTranslatedPage is similar to merge_page, but the stream to be | |
| merged is translated and scaled by applying a transformation matrix. | |
| :param PageObject page2: the page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float scale: The scaling factor | |
| :param float tx: The translation on X axis | |
| :param float ty: The translation on Y axis | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeScaledTranslatedPage(page2, scale, tx, ty, expand)", | |
| "page2.add_transformation(Transformation().scale(scale).translate(tx, ty)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().scale(scale, scale).translate(tx, ty) | |
| return self.mergeTransformedPage(page2, op, expand) | |
| def mergeRotatedScaledTranslatedPage( | |
| self, | |
| page2: "PageObject", | |
| rotation: float, | |
| scale: float, | |
| tx: float, | |
| ty: float, | |
| expand: bool = False, | |
| ) -> None: # pragma: no cover | |
| """ | |
| mergeRotatedScaledTranslatedPage is similar to merge_page, but the | |
| stream to be merged is translated, rotated and scaled by applying a | |
| transformation matrix. | |
| :param PageObject page2: the page to be merged into this one. Should be | |
| an instance of :class:`PageObject<PageObject>`. | |
| :param float tx: The translation on X axis | |
| :param float ty: The translation on Y axis | |
| :param float rotation: The angle of the rotation, in degrees | |
| :param float scale: The scaling factor | |
| :param bool expand: Whether the page should be expanded to fit the | |
| dimensions of the page to be merged. | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` and :meth:`merge_page` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "page.mergeRotatedScaledTranslatedPage(page2, rotation, tx, ty, expand)", | |
| "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", | |
| "3.0.0", | |
| ) | |
| op = Transformation().rotate(rotation).scale(scale, scale).translate(tx, ty) | |
| self.mergeTransformedPage(page2, op, expand) | |
| def add_transformation( | |
| self, | |
| ctm: Union[Transformation, CompressedTransformationMatrix], | |
| expand: bool = False, | |
| ) -> None: | |
| """ | |
| Apply a transformation matrix to the page. | |
| Args: | |
| ctm: A 6-element tuple containing the operands of the | |
| transformation matrix. Alternatively, a | |
| :py:class:`Transformation<PyPDF2.Transformation>` | |
| object can be passed. | |
| See :doc:`/user/cropping-and-transforming`. | |
| """ | |
| if isinstance(ctm, Transformation): | |
| ctm = ctm.ctm | |
| content = self.get_contents() | |
| if content is not None: | |
| content = PageObject._add_transformation_matrix(content, self.pdf, ctm) | |
| content = PageObject._push_pop_gs(content, self.pdf) | |
| self[NameObject(PG.CONTENTS)] = content | |
| # if expanding the page to fit a new page, calculate the new media box size | |
| if expand: | |
| corners = [ | |
| self.mediabox.left.as_numeric(), | |
| self.mediabox.bottom.as_numeric(), | |
| self.mediabox.left.as_numeric(), | |
| self.mediabox.top.as_numeric(), | |
| self.mediabox.right.as_numeric(), | |
| self.mediabox.top.as_numeric(), | |
| self.mediabox.right.as_numeric(), | |
| self.mediabox.bottom.as_numeric(), | |
| ] | |
| ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] | |
| new_x = [ | |
| ctm[0] * corners[i] + ctm[2] * corners[i + 1] + ctm[4] | |
| for i in range(0, 8, 2) | |
| ] | |
| new_y = [ | |
| ctm[1] * corners[i] + ctm[3] * corners[i + 1] + ctm[5] | |
| for i in range(0, 8, 2) | |
| ] | |
| lowerleft = (min(new_x), min(new_y)) | |
| upperright = (max(new_x), max(new_y)) | |
| lowerleft = (min(corners[0], lowerleft[0]), min(corners[1], lowerleft[1])) | |
| upperright = ( | |
| max(corners[2], upperright[0]), | |
| max(corners[3], upperright[1]), | |
| ) | |
| self.mediabox.lower_left = lowerleft | |
| self.mediabox.upper_right = upperright | |
| def addTransformation( | |
| self, ctm: CompressedTransformationMatrix | |
| ) -> None: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`add_transformation` instead. | |
| """ | |
| deprecation_with_replacement("addTransformation", "add_transformation", "3.0.0") | |
| self.add_transformation(ctm) | |
| def scale(self, sx: float, sy: float) -> None: | |
| """ | |
| Scale a page by the given factors by applying a transformation | |
| matrix to its content and updating the page size. | |
| This updates the mediabox, the cropbox, and the contents | |
| of the page. | |
| Args: | |
| sx: The scaling factor on horizontal axis. | |
| sy: The scaling factor on vertical axis. | |
| """ | |
| self.add_transformation((sx, 0, 0, sy, 0, 0)) | |
| self.cropbox = self.cropbox.scale(sx, sy) | |
| self.artbox = self.artbox.scale(sx, sy) | |
| self.bleedbox = self.bleedbox.scale(sx, sy) | |
| self.trimbox = self.trimbox.scale(sx, sy) | |
| self.mediabox = self.mediabox.scale(sx, sy) | |
| if PG.ANNOTS in self: | |
| annotations = self[PG.ANNOTS] | |
| if isinstance(annotations, ArrayObject): | |
| for annotation in annotations: | |
| annotation_obj = annotation.get_object() | |
| if ADA.Rect in annotation_obj: | |
| rectangle = annotation_obj[ADA.Rect] | |
| if isinstance(rectangle, ArrayObject): | |
| rectangle[0] = FloatObject(float(rectangle[0]) * sx) | |
| rectangle[1] = FloatObject(float(rectangle[1]) * sy) | |
| rectangle[2] = FloatObject(float(rectangle[2]) * sx) | |
| rectangle[3] = FloatObject(float(rectangle[3]) * sy) | |
| if PG.VP in self: | |
| viewport = self[PG.VP] | |
| if isinstance(viewport, ArrayObject): | |
| bbox = viewport[0]["/BBox"] | |
| else: | |
| bbox = viewport["/BBox"] # type: ignore | |
| scaled_bbox = RectangleObject( | |
| ( | |
| float(bbox[0]) * sx, | |
| float(bbox[1]) * sy, | |
| float(bbox[2]) * sx, | |
| float(bbox[3]) * sy, | |
| ) | |
| ) | |
| if isinstance(viewport, ArrayObject): | |
| self[NameObject(PG.VP)][NumberObject(0)][ # type: ignore | |
| NameObject("/BBox") | |
| ] = scaled_bbox | |
| else: | |
| self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox # type: ignore | |
| def scale_by(self, factor: float) -> None: | |
| """ | |
| Scale a page by the given factor by applying a transformation | |
| matrix to its content and updating the page size. | |
| Args: | |
| factor: The scaling factor (for both X and Y axis). | |
| """ | |
| self.scale(factor, factor) | |
| def scaleBy(self, factor: float) -> None: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`scale_by` instead. | |
| """ | |
| deprecation_with_replacement("scaleBy", "scale_by", "3.0.0") | |
| self.scale(factor, factor) | |
| def scale_to(self, width: float, height: float) -> None: | |
| """ | |
| Scale a page to the specified dimensions by applying a | |
| transformation matrix to its content and updating the page size. | |
| Args: | |
| width: The new width. | |
| height: The new height. | |
| """ | |
| sx = width / float(self.mediabox.width) | |
| sy = height / float(self.mediabox.height) | |
| self.scale(sx, sy) | |
| def scaleTo(self, width: float, height: float) -> None: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`scale_to` instead. | |
| """ | |
| deprecation_with_replacement("scaleTo", "scale_to", "3.0.0") | |
| self.scale_to(width, height) | |
| def compress_content_streams(self) -> None: | |
| """ | |
| Compress the size of this page by joining all content streams and | |
| applying a FlateDecode filter. | |
| However, it is possible that this function will perform no action if | |
| content stream compression becomes "automatic". | |
| """ | |
| content = self.get_contents() | |
| if content is not None: | |
| if not isinstance(content, ContentStream): | |
| content = ContentStream(content, self.pdf) | |
| self[NameObject(PG.CONTENTS)] = content.flate_encode() | |
| def compressContentStreams(self) -> None: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`compress_content_streams` instead. | |
| """ | |
| deprecation_with_replacement( | |
| "compressContentStreams", "compress_content_streams", "3.0.0" | |
| ) | |
| self.compress_content_streams() | |
| def _debug_for_extract(self) -> str: # pragma: no cover | |
| out = "" | |
| for ope, op in ContentStream( | |
| self["/Contents"].get_object(), self.pdf, "bytes" | |
| ).operations: | |
| if op == b"TJ": | |
| s = [x for x in ope[0] if isinstance(x, str)] | |
| else: | |
| s = [] | |
| out += op.decode("utf-8") + " " + "".join(s) + ope.__repr__() + "\n" | |
| out += "\n=============================\n" | |
| try: | |
| for fo in self[PG.RESOURCES]["/Font"]: # type:ignore | |
| out += fo + "\n" | |
| out += self[PG.RESOURCES]["/Font"][fo].__repr__() + "\n" # type:ignore | |
| try: | |
| enc_repr = self[PG.RESOURCES]["/Font"][fo][ # type:ignore | |
| "/Encoding" | |
| ].__repr__() | |
| out += enc_repr + "\n" | |
| except Exception: | |
| pass | |
| try: | |
| out += ( | |
| self[PG.RESOURCES]["/Font"][fo][ # type:ignore | |
| "/ToUnicode" | |
| ] | |
| .get_data() | |
| .decode() | |
| + "\n" | |
| ) | |
| except Exception: | |
| pass | |
| except KeyError: | |
| out += "No Font\n" | |
| return out | |
| def _extract_text( | |
| self, | |
| obj: Any, | |
| pdf: Any, | |
| orientations: Tuple[int, ...] = (0, 90, 180, 270), | |
| space_width: float = 200.0, | |
| content_key: Optional[str] = PG.CONTENTS, | |
| visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
| visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
| visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, | |
| ) -> str: | |
| """ | |
| See extract_text for most arguments. | |
| Args: | |
| content_key: indicate the default key where to extract data | |
| None = the object; this allow to reuse the function on XObject | |
| default = "/Content" | |
| """ | |
| text: str = "" | |
| output: str = "" | |
| rtl_dir: bool = False # right-to-left | |
| cmaps: Dict[ | |
| str, | |
| Tuple[ | |
| str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject | |
| ], | |
| ] = {} | |
| try: | |
| objr = obj | |
| while NameObject(PG.RESOURCES) not in objr: | |
| # /Resources can be inherited sometimes so we look to parents | |
| objr = objr["/Parent"].get_object() | |
| # if no parents we will have no /Resources will be available => an exception wil be raised | |
| resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) | |
| except Exception: | |
| return "" # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj | |
| if "/Font" in resources_dict: | |
| for f in cast(DictionaryObject, resources_dict["/Font"]): | |
| cmaps[f] = build_char_map(f, space_width, obj) | |
| cmap: Tuple[ | |
| Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] | |
| ] = ( | |
| "charmap", | |
| {}, | |
| "NotInitialized", | |
| None, | |
| ) # (encoding,CMAP,font resource name,dictionary-object of font) | |
| try: | |
| content = ( | |
| obj[content_key].get_object() if isinstance(content_key, str) else obj | |
| ) | |
| if not isinstance(content, ContentStream): | |
| content = ContentStream(content, pdf, "bytes") | |
| except KeyError: # it means no content can be extracted(certainly empty page) | |
| return "" | |
| # Note: we check all strings are TextStringObjects. ByteStringObjects | |
| # are strings where the byte->string encoding was unknown, so adding | |
| # them to the text here would be gibberish. | |
| cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
| cm_stack = [] | |
| tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
| tm_prev: List[float] = [ | |
| 1.0, | |
| 0.0, | |
| 0.0, | |
| 1.0, | |
| 0.0, | |
| 0.0, | |
| ] # will store cm_matrix * tm_matrix | |
| char_scale = 1.0 | |
| space_scale = 1.0 | |
| _space_width: float = 500.0 # will be set correctly at first Tf | |
| TL = 0.0 | |
| font_size = 12.0 # init just in case of | |
| def mult(m: List[float], n: List[float]) -> List[float]: | |
| return [ | |
| m[0] * n[0] + m[1] * n[2], | |
| m[0] * n[1] + m[1] * n[3], | |
| m[2] * n[0] + m[3] * n[2], | |
| m[2] * n[1] + m[3] * n[3], | |
| m[4] * n[0] + m[5] * n[2] + n[4], | |
| m[4] * n[1] + m[5] * n[3] + n[5], | |
| ] | |
| def orient(m: List[float]) -> int: | |
| if m[3] > 1e-6: | |
| return 0 | |
| elif m[3] < -1e-6: | |
| return 180 | |
| elif m[1] > 0: | |
| return 90 | |
| else: | |
| return 270 | |
| def current_spacewidth() -> float: | |
| # return space_scale * _space_width * char_scale | |
| return _space_width / 1000.0 | |
| def process_operation(operator: bytes, operands: List) -> None: | |
| nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations, rtl_dir, visitor_text | |
| global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS | |
| check_crlf_space: bool = False | |
| # Table 5.4 page 405 | |
| if operator == b"BT": | |
| tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
| # tm_prev = tm_matrix | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| # based | |
| # if output != "" and output[-1]!="\n": | |
| # output += "\n" | |
| text = "" | |
| return None | |
| elif operator == b"ET": | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| text = "" | |
| # table 4.7 "Graphics state operators", page 219 | |
| # cm_matrix calculation is a reserved for the moment | |
| elif operator == b"q": | |
| cm_stack.append( | |
| ( | |
| cm_matrix, | |
| cmap, | |
| font_size, | |
| char_scale, | |
| space_scale, | |
| _space_width, | |
| TL, | |
| ) | |
| ) | |
| elif operator == b"Q": | |
| try: | |
| ( | |
| cm_matrix, | |
| cmap, | |
| font_size, | |
| char_scale, | |
| space_scale, | |
| _space_width, | |
| TL, | |
| ) = cm_stack.pop() | |
| except Exception: | |
| cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] | |
| # rtl_dir = False | |
| elif operator == b"cm": | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| text = "" | |
| cm_matrix = mult( | |
| [ | |
| float(operands[0]), | |
| float(operands[1]), | |
| float(operands[2]), | |
| float(operands[3]), | |
| float(operands[4]), | |
| float(operands[5]), | |
| ], | |
| cm_matrix, | |
| ) | |
| # rtl_dir = False | |
| # Table 5.2 page 398 | |
| elif operator == b"Tz": | |
| char_scale = float(operands[0]) / 100.0 | |
| elif operator == b"Tw": | |
| space_scale = 1.0 + float(operands[0]) | |
| elif operator == b"TL": | |
| TL = float(operands[0]) | |
| elif operator == b"Tf": | |
| if text != "": | |
| output += text # .translate(cmap) | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| text = "" | |
| # rtl_dir = False | |
| try: | |
| # charMapTuple: font_type, float(sp_width / 2), encoding, map_dict, font-dictionary | |
| charMapTuple = cmaps[operands[0]] | |
| _space_width = charMapTuple[1] | |
| # current cmap: encoding, map_dict, font resource name (internal name, not the real font-name), | |
| # font-dictionary. The font-dictionary describes the font. | |
| cmap = ( | |
| charMapTuple[2], | |
| charMapTuple[3], | |
| operands[0], | |
| charMapTuple[4], | |
| ) | |
| except KeyError: # font not found | |
| _space_width = unknown_char_map[1] | |
| cmap = ( | |
| unknown_char_map[2], | |
| unknown_char_map[3], | |
| "???" + operands[0], | |
| None, | |
| ) | |
| try: | |
| font_size = float(operands[1]) | |
| except Exception: | |
| pass # keep previous size | |
| # Table 5.5 page 406 | |
| elif operator == b"Td": | |
| check_crlf_space = True | |
| # A special case is a translating only tm: | |
| # tm[0..5] = 1 0 0 1 e f, | |
| # i.e. tm[4] += tx, tm[5] += ty. | |
| tx = float(operands[0]) | |
| ty = float(operands[1]) | |
| tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] | |
| tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] | |
| elif operator == b"Tm": | |
| check_crlf_space = True | |
| tm_matrix = [ | |
| float(operands[0]), | |
| float(operands[1]), | |
| float(operands[2]), | |
| float(operands[3]), | |
| float(operands[4]), | |
| float(operands[5]), | |
| ] | |
| elif operator == b"T*": | |
| check_crlf_space = True | |
| tm_matrix[5] -= TL | |
| elif operator == b"Tj": | |
| check_crlf_space = True | |
| m = mult(tm_matrix, cm_matrix) | |
| orientation = orient(m) | |
| if orientation in orientations: | |
| if isinstance(operands[0], str): | |
| text += operands[0] | |
| else: | |
| t: str = "" | |
| tt: bytes = ( | |
| encode_pdfdocencoding(operands[0]) | |
| if isinstance(operands[0], str) | |
| else operands[0] | |
| ) | |
| if isinstance(cmap[0], str): | |
| try: | |
| t = tt.decode( | |
| cmap[0], "surrogatepass" | |
| ) # apply str encoding | |
| except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good | |
| t = tt.decode( | |
| "utf-16-be" if cmap[0] == "charmap" else "charmap", | |
| "surrogatepass", | |
| ) # apply str encoding | |
| else: # apply dict encoding | |
| t = "".join( | |
| [ | |
| cmap[0][x] if x in cmap[0] else bytes((x,)).decode() | |
| for x in tt | |
| ] | |
| ) | |
| # "\u0590 - \u08FF \uFB50 - \uFDFF" | |
| for x in "".join( | |
| [cmap[1][x] if x in cmap[1] else x for x in t] | |
| ): | |
| xx = ord(x) | |
| # fmt: off | |
| if ( # cases where the current inserting order is kept (punctuation,...) | |
| (xx <= 0x2F) # punctuations but... | |
| or (0x3A <= xx and xx <= 0x40) # numbers (x30-39) | |
| or (0x2000 <= xx and xx <= 0x206F) # upper punctuations.. | |
| or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents | |
| or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... | |
| ): | |
| text = x + text if rtl_dir else text + x | |
| elif ( # right-to-left characters set | |
| (0x0590 <= xx and xx <= 0x08FF) | |
| or (0xFB1D <= xx and xx <= 0xFDFF) | |
| or (0xFE70 <= xx and xx <= 0xFEFF) | |
| or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX) | |
| ): | |
| # print("<",xx,x) | |
| if not rtl_dir: | |
| rtl_dir = True | |
| # print("RTL",text,"*") | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| text = "" | |
| text = x + text | |
| else: # left-to-right | |
| # print(">",xx,x,end="") | |
| if rtl_dir: | |
| rtl_dir = False | |
| # print("LTR",text,"*") | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| text = "" | |
| text = text + x | |
| # fmt: on | |
| else: | |
| return None | |
| if check_crlf_space: | |
| m = mult(tm_matrix, cm_matrix) | |
| orientation = orient(m) | |
| delta_x = m[4] - tm_prev[4] | |
| delta_y = m[5] - tm_prev[5] | |
| k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) | |
| f = font_size * k | |
| tm_prev = m | |
| if orientation not in orientations: | |
| return None | |
| try: | |
| if orientation == 0: | |
| if delta_y < -0.8 * f: | |
| if (output + text)[-1] != "\n": | |
| output += text + "\n" | |
| if visitor_text is not None: | |
| visitor_text( | |
| text + "\n", | |
| cm_matrix, | |
| tm_matrix, | |
| cmap[3], | |
| font_size, | |
| ) | |
| text = "" | |
| elif ( | |
| abs(delta_y) < f * 0.3 | |
| and abs(delta_x) > current_spacewidth() * f * 15 | |
| ): | |
| if (output + text)[-1] != " ": | |
| text += " " | |
| elif orientation == 180: | |
| if delta_y > 0.8 * f: | |
| if (output + text)[-1] != "\n": | |
| output += text + "\n" | |
| if visitor_text is not None: | |
| visitor_text( | |
| text + "\n", | |
| cm_matrix, | |
| tm_matrix, | |
| cmap[3], | |
| font_size, | |
| ) | |
| text = "" | |
| elif ( | |
| abs(delta_y) < f * 0.3 | |
| and abs(delta_x) > current_spacewidth() * f * 15 | |
| ): | |
| if (output + text)[-1] != " ": | |
| text += " " | |
| elif orientation == 90: | |
| if delta_x > 0.8 * f: | |
| if (output + text)[-1] != "\n": | |
| output += text + "\n" | |
| if visitor_text is not None: | |
| visitor_text( | |
| text + "\n", | |
| cm_matrix, | |
| tm_matrix, | |
| cmap[3], | |
| font_size, | |
| ) | |
| text = "" | |
| elif ( | |
| abs(delta_x) < f * 0.3 | |
| and abs(delta_y) > current_spacewidth() * f * 15 | |
| ): | |
| if (output + text)[-1] != " ": | |
| text += " " | |
| elif orientation == 270: | |
| if delta_x < -0.8 * f: | |
| if (output + text)[-1] != "\n": | |
| output += text + "\n" | |
| if visitor_text is not None: | |
| visitor_text( | |
| text + "\n", | |
| cm_matrix, | |
| tm_matrix, | |
| cmap[3], | |
| font_size, | |
| ) | |
| text = "" | |
| elif ( | |
| abs(delta_x) < f * 0.3 | |
| and abs(delta_y) > current_spacewidth() * f * 15 | |
| ): | |
| if (output + text)[-1] != " ": | |
| text += " " | |
| except Exception: | |
| pass | |
| for operands, operator in content.operations: | |
| if visitor_operand_before is not None: | |
| visitor_operand_before(operator, operands, cm_matrix, tm_matrix) | |
| # multiple operators are defined in here #### | |
| if operator == b"'": | |
| process_operation(b"T*", []) | |
| process_operation(b"Tj", operands) | |
| elif operator == b'"': | |
| process_operation(b"Tw", [operands[0]]) | |
| process_operation(b"Tc", [operands[1]]) | |
| process_operation(b"T*", []) | |
| process_operation(b"Tj", operands[2:]) | |
| elif operator == b"TD": | |
| process_operation(b"TL", [-operands[1]]) | |
| process_operation(b"Td", operands) | |
| elif operator == b"TJ": | |
| for op in operands[0]: | |
| if isinstance(op, (str, bytes)): | |
| process_operation(b"Tj", [op]) | |
| if isinstance(op, (int, float, NumberObject, FloatObject)): | |
| if ( | |
| (abs(float(op)) >= _space_width) | |
| and (len(text) > 0) | |
| and (text[-1] != " ") | |
| ): | |
| process_operation(b"Tj", [" "]) | |
| elif operator == b"Do": | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| try: | |
| if output[-1] != "\n": | |
| output += "\n" | |
| if visitor_text is not None: | |
| visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size) | |
| except IndexError: | |
| pass | |
| try: | |
| xobj = resources_dict["/XObject"] | |
| if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore | |
| # output += text | |
| text = self.extract_xform_text( | |
| xobj[operands[0]], # type: ignore | |
| orientations, | |
| space_width, | |
| visitor_operand_before, | |
| visitor_operand_after, | |
| visitor_text, | |
| ) | |
| output += text | |
| if visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| except Exception: | |
| logger_warning( | |
| f" impossible to decode XFormObject {operands[0]}", | |
| __name__, | |
| ) | |
| finally: | |
| text = "" | |
| else: | |
| process_operation(operator, operands) | |
| if visitor_operand_after is not None: | |
| visitor_operand_after(operator, operands, cm_matrix, tm_matrix) | |
| output += text # just in case of | |
| if text != "" and visitor_text is not None: | |
| visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) | |
| return output | |
| def extract_text( | |
| self, | |
| *args: Any, | |
| Tj_sep: str = None, | |
| TJ_sep: str = None, | |
| orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270), | |
| space_width: float = 200.0, | |
| visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
| visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
| visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, | |
| ) -> str: | |
| """ | |
| Locate all text drawing commands, in the order they are provided in the | |
| content stream, and extract the text. | |
| This works well for some PDF files, but poorly for others, depending on | |
| the generator used. This will be refined in the future. | |
| Do not rely on the order of text coming out of this function, as it | |
| will change if this function is made more sophisticated. | |
| Arabic, Hebrew,... are extracted in the good order. | |
| If required an custom RTL range of characters can be defined; see function set_custom_rtl | |
| Additionally you can provide visitor-methods to get informed on all operands and all text-objects. | |
| For example in some PDF files this can be useful to parse tables. | |
| Args: | |
| Tj_sep: Deprecated. Kept for compatibility until PyPDF2 4.0.0 | |
| TJ_sep: Deprecated. Kept for compatibility until PyPDF2 4.0.0 | |
| orientations: list of orientations text_extraction will look for | |
| default = (0, 90, 180, 270) | |
| note: currently only 0(Up),90(turned Left), 180(upside Down), | |
| 270 (turned Right) | |
| space_width: force default space width | |
| if not extracted from font (default: 200) | |
| visitor_operand_before: function to be called before processing an operand. | |
| It has four arguments: operand, operand-arguments, | |
| current transformation matrix and text matrix. | |
| visitor_operand_after: function to be called after processing an operand. | |
| It has four arguments: operand, operand-arguments, | |
| current transformation matrix and text matrix. | |
| visitor_text: function to be called when extracting some text at some position. | |
| It has five arguments: text, current transformation matrix, | |
| text matrix, font-dictionary and font-size. | |
| The font-dictionary may be None in case of unknown fonts. | |
| If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". | |
| Returns: | |
| The extracted text | |
| """ | |
| if len(args) >= 1: | |
| if isinstance(args[0], str): | |
| Tj_sep = args[0] | |
| if len(args) >= 2: | |
| if isinstance(args[1], str): | |
| TJ_sep = args[1] | |
| else: | |
| raise TypeError(f"Invalid positional parameter {args[1]}") | |
| if len(args) >= 3: | |
| if isinstance(args[2], (tuple, int)): | |
| orientations = args[2] | |
| else: | |
| raise TypeError(f"Invalid positional parameter {args[2]}") | |
| if len(args) >= 4: | |
| if isinstance(args[3], (float, int)): | |
| space_width = args[3] | |
| else: | |
| raise TypeError(f"Invalid positional parameter {args[3]}") | |
| elif isinstance(args[0], (tuple, int)): | |
| orientations = args[0] | |
| if len(args) >= 2: | |
| if isinstance(args[1], (float, int)): | |
| space_width = args[1] | |
| else: | |
| raise TypeError(f"Invalid positional parameter {args[1]}") | |
| else: | |
| raise TypeError(f"Invalid positional parameter {args[0]}") | |
| if Tj_sep is not None or TJ_sep is not None: | |
| warnings.warn( | |
| "parameters Tj_Sep, TJ_sep depreciated, and will be removed in PyPDF2 4.0.0.", | |
| DeprecationWarning, | |
| ) | |
| if isinstance(orientations, int): | |
| orientations = (orientations,) | |
| return self._extract_text( | |
| self, | |
| self.pdf, | |
| orientations, | |
| space_width, | |
| PG.CONTENTS, | |
| visitor_operand_before, | |
| visitor_operand_after, | |
| visitor_text, | |
| ) | |
| def extract_xform_text( | |
| self, | |
| xform: EncodedStreamObject, | |
| orientations: Tuple[int, ...] = (0, 90, 270, 360), | |
| space_width: float = 200.0, | |
| visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
| visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, | |
| visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, | |
| ) -> str: | |
| """ | |
| Extract text from an XObject. | |
| Args: | |
| space_width: force default space width (if not extracted from font (default 200) | |
| Returns: | |
| The extracted text | |
| """ | |
| return self._extract_text( | |
| xform, | |
| self.pdf, | |
| orientations, | |
| space_width, | |
| None, | |
| visitor_operand_before, | |
| visitor_operand_after, | |
| visitor_text, | |
| ) | |
| def extractText( | |
| self, Tj_sep: str = "", TJ_sep: str = "" | |
| ) -> str: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :meth:`extract_text` instead. | |
| """ | |
| deprecation_with_replacement("extractText", "extract_text", "3.0.0") | |
| return self.extract_text() | |
| def _get_fonts(self) -> Tuple[Set[str], Set[str]]: | |
| """ | |
| Get the names of embedded fonts and unembedded fonts. | |
| :return: (Set of embedded fonts, set of unembedded fonts) | |
| """ | |
| obj = self.get_object() | |
| assert isinstance(obj, DictionaryObject) | |
| fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj[PG.RESOURCES])) | |
| unembedded = fonts - embedded | |
| return embedded, unembedded | |
| mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) | |
| """ | |
| A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
| defining the boundaries of the physical medium on which the page is | |
| intended to be displayed or printed. | |
| """ | |
| def mediaBox(self) -> RectangleObject: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :py:attr:`mediabox` instead. | |
| """ | |
| deprecation_with_replacement("mediaBox", "mediabox", "3.0.0") | |
| return self.mediabox | |
| def mediaBox(self, value: RectangleObject) -> None: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :py:attr:`mediabox` instead. | |
| """ | |
| deprecation_with_replacement("mediaBox", "mediabox", "3.0.0") | |
| self.mediabox = value | |
| cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,)) | |
| """ | |
| A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
| defining the visible region of default user space. When the page is | |
| displayed or printed, its contents are to be clipped (cropped) to this | |
| rectangle and then imposed on the output medium in some | |
| implementation-defined manner. Default value: same as :attr:`mediabox<mediabox>`. | |
| """ | |
| def cropBox(self) -> RectangleObject: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :py:attr:`cropbox` instead. | |
| """ | |
| deprecation_with_replacement("cropBox", "cropbox", "3.0.0") | |
| return self.cropbox | |
| def cropBox(self, value: RectangleObject) -> None: # pragma: no cover | |
| deprecation_with_replacement("cropBox", "cropbox", "3.0.0") | |
| self.cropbox = value | |
| bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) | |
| """ | |
| A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
| defining the region to which the contents of the page should be clipped | |
| when output in a production environment. | |
| """ | |
| def bleedBox(self) -> RectangleObject: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :py:attr:`bleedbox` instead. | |
| """ | |
| deprecation_with_replacement("bleedBox", "bleedbox", "3.0.0") | |
| return self.bleedbox | |
| def bleedBox(self, value: RectangleObject) -> None: # pragma: no cover | |
| deprecation_with_replacement("bleedBox", "bleedbox", "3.0.0") | |
| self.bleedbox = value | |
| trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) | |
| """ | |
| A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
| defining the intended dimensions of the finished page after trimming. | |
| """ | |
| def trimBox(self) -> RectangleObject: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :py:attr:`trimbox` instead. | |
| """ | |
| deprecation_with_replacement("trimBox", "trimbox", "3.0.0") | |
| return self.trimbox | |
| def trimBox(self, value: RectangleObject) -> None: # pragma: no cover | |
| deprecation_with_replacement("trimBox", "trimbox", "3.0.0") | |
| self.trimbox = value | |
| artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) | |
| """ | |
| A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units, | |
| defining the extent of the page's meaningful content as intended by the | |
| page's creator. | |
| """ | |
| def artBox(self) -> RectangleObject: # pragma: no cover | |
| """ | |
| .. deprecated:: 1.28.0 | |
| Use :py:attr:`artbox` instead. | |
| """ | |
| deprecation_with_replacement("artBox", "artbox", "3.0.0") | |
| return self.artbox | |
| def artBox(self, value: RectangleObject) -> None: # pragma: no cover | |
| deprecation_with_replacement("artBox", "artbox", "3.0.0") | |
| self.artbox = value | |
| def annotations(self) -> Optional[ArrayObject]: | |
| if "/Annots" not in self: | |
| return None | |
| else: | |
| return cast(ArrayObject, self["/Annots"]) | |
| def annotations(self, value: Optional[ArrayObject]) -> None: | |
| """ | |
| Set the annotations array of the page. | |
| Typically you don't want to set this value, but append to it. | |
| If you append to it, don't forget to add the object first to the writer | |
| and only add the indirect object. | |
| """ | |
| if value is None: | |
| del self[NameObject("/Annots")] | |
| else: | |
| self[NameObject("/Annots")] = value | |
| class _VirtualList: | |
| def __init__( | |
| self, | |
| length_function: Callable[[], int], | |
| get_function: Callable[[int], PageObject], | |
| ) -> None: | |
| self.length_function = length_function | |
| self.get_function = get_function | |
| self.current = -1 | |
| def __len__(self) -> int: | |
| return self.length_function() | |
| def __getitem__(self, index: int) -> PageObject: | |
| if isinstance(index, slice): | |
| indices = range(*index.indices(len(self))) | |
| cls = type(self) | |
| return cls(indices.__len__, lambda idx: self[indices[idx]]) # type: ignore | |
| if not isinstance(index, int): | |
| raise TypeError("sequence indices must be integers") | |
| len_self = len(self) | |
| if index < 0: | |
| # support negative indexes | |
| index = len_self + index | |
| if index < 0 or index >= len_self: | |
| raise IndexError("sequence index out of range") | |
| return self.get_function(index) | |
| def __iter__(self) -> Iterator[PageObject]: | |
| for i in range(len(self)): | |
| yield self[i] | |
| def _get_fonts_walk( | |
| obj: DictionaryObject, | |
| fnt: Optional[Set[str]] = None, | |
| emb: Optional[Set[str]] = None, | |
| ) -> Tuple[Set[str], Set[str]]: | |
| """ | |
| If there is a key called 'BaseFont', that is a font that is used in the document. | |
| If there is a key called 'FontName' and another key in the same dictionary object | |
| that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is | |
| embedded. | |
| We create and add to two sets, fnt = fonts used and emb = fonts embedded. | |
| """ | |
| if fnt is None: | |
| fnt = set() | |
| if emb is None: | |
| emb = set() | |
| if not hasattr(obj, "keys"): | |
| return set(), set() | |
| fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") | |
| if "/BaseFont" in obj: | |
| fnt.add(cast(str, obj["/BaseFont"])) | |
| if "/FontName" in obj: | |
| if [x for x in fontkeys if x in obj]: # test to see if there is FontFile | |
| emb.add(cast(str, obj["/FontName"])) | |
| for key in obj.keys(): | |
| _get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb) | |
| return fnt, emb # return the sets for each page | |