Spaces:

gilzero
/

editor-app-v10

Paused

App Files Files Community

editor-app-v10 / myenv /lib /python3.10 /site-packages /docx /oxml /text /pagebreak.py

gilzero

Upload folder using huggingface_hub

cb1a5c9 verified almost 2 years ago

raw

history blame contribute delete

11.3 kB

	"""Custom element class for rendered page-break (CT_LastRenderedPageBreak)."""

	from __future__ import annotations

	import copy
	from typing import TYPE_CHECKING

	from docx.oxml.xmlchemy import BaseOxmlElement
	from docx.shared import lazyproperty

	if TYPE_CHECKING:
	from docx.oxml.text.hyperlink import CT_Hyperlink
	from docx.oxml.text.paragraph import CT_P


	class CT_LastRenderedPageBreak(BaseOxmlElement):
	"""`<w:lastRenderedPageBreak>` element, indicating page break inserted by renderer.

	A rendered page-break is one inserted by the renderer when it runs out of room on a
	page. It is an empty element (no attrs or children) and is a child of CT_R, peer to
	CT_Text.

	NOTE: this complex-type name does not exist in the schema, where
	`w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
	distinguished behavior. CT_Empty is used for many elements.
	"""

	@property
	def following_fragment_p(self) -> CT_P:
	"""A "loose" `CT_P` containing only the paragraph content before this break.

	Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
	page-break in its paragraph.

	The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
	page-break with this `w:lastRenderedPageBreak` element and all content preceding
	it removed.

	NOTE: this `w:p` can itself contain one or more `w:renderedPageBreak` elements
	(when the paragraph contained more than one). While this is rare, the caller
	should treat this paragraph the same as other paragraphs and split it if
	necessary in a folloing step or recursion.
	"""
	if not self == self._first_lrpb_in_p(self._enclosing_p):
	raise ValueError("only defined on first rendered page-break in paragraph")

	# -- splitting approach is different when break is inside a hyperlink --
	return (
	self._following_frag_in_hlink
	if self._is_in_hyperlink
	else self._following_frag_in_run
	)

	@property
	def follows_all_content(self) -> bool:
	"""True when this page-break element is the last "content" in the paragraph.

	This is very uncommon case and may only occur in contrived or cases where the
	XML is edited by hand, but it is not precluded by the spec.
	"""
	# -- a page-break inside a hyperlink never meets these criteria (for our
	# -- purposes at least) because it is considered "atomic" and always associated
	# -- with the page it starts on.
	if self._is_in_hyperlink:
	return False

	return bool(
	# -- XPath will match zero-or-one w:lastRenderedPageBreak element --
	self._enclosing_p.xpath(
	# -- in first run of paragraph --
	f"(./w:r)[last()]"
	# -- all page-breaks --
	f"/w:lastRenderedPageBreak"
	# -- that are not preceded by any content-bearing elements --
	f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
	)
	)

	@property
	def precedes_all_content(self) -> bool:
	"""True when a `w:lastRenderedPageBreak` precedes all paragraph content.

	This is a common case; it occurs whenever the page breaks on an even paragraph
	boundary.
	"""
	# -- a page-break inside a hyperlink never meets these criteria because there
	# -- is always part of the hyperlink text before the page-break.
	if self._is_in_hyperlink:
	return False

	return bool(
	# -- XPath will match zero-or-one w:lastRenderedPageBreak element --
	self._enclosing_p.xpath(
	# -- in first run of paragraph --
	f"./w:r[1]"
	# -- all page-breaks --
	f"/w:lastRenderedPageBreak"
	# -- that are not preceded by any content-bearing elements --
	f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
	)
	)

	@property
	def preceding_fragment_p(self) -> CT_P:
	"""A "loose" `CT_P` containing only the paragraph content before this break.

	Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
	paragraph in its paragraph.

	The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
	page-break with this `w:lastRenderedPageBreak` element and all its following
	siblings removed.
	"""
	if not self == self._first_lrpb_in_p(self._enclosing_p):
	raise ValueError("only defined on first rendered page-break in paragraph")

	# -- splitting approach is different when break is inside a hyperlink --
	return (
	self._preceding_frag_in_hlink
	if self._is_in_hyperlink
	else self._preceding_frag_in_run
	)

	def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
	"""The `w:hyperlink` grandparent of this `w:lastRenderedPageBreak`.

	Raises `IndexError` when this page-break has a `w:p` grandparent, so only call
	when `._is_in_hyperlink` is True.
	"""
	return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]

	@property
	def _enclosing_p(self) -> CT_P:
	"""The `w:p` element parent or grandparent of this `w:lastRenderedPageBreak`."""
	return self.xpath("./ancestor::w:p[1]")[0]

	def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
	"""The first `w:lastRenderedPageBreak` element in `p`.

	Raises `ValueError` if there are no rendered page-breaks in `p`.
	"""
	lrpbs = p.xpath(
	"./w:r/w:lastRenderedPageBreak \| ./w:hyperlink/w:r/w:lastRenderedPageBreak"
	)
	if not lrpbs:
	raise ValueError("no rendered page-breaks in paragraph element")
	return lrpbs[0]

	@lazyproperty
	def _following_frag_in_hlink(self) -> CT_P:
	"""Following CT_P fragment when break occurs within a hyperlink.

	Note this is a partial-function and raises when `lrpb` is not inside a
	hyperlink.
	"""
	if not self._is_in_hyperlink:
	raise ValueError("only defined on a rendered page-break in a hyperlink")

	# -- work on a clone `w:p` so our mutations don't persist --
	p = copy.deepcopy(self._enclosing_p)

	# -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
	lrpb = self._first_lrpb_in_p(p)

	# -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
	hyperlink = lrpb._enclosing_hyperlink(lrpb)

	# -- delete all w:p inner-content preceding the hyperlink --
	for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
	p.remove(e)

	# -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
	hyperlink.getparent().remove(hyperlink)

	# -- that's it, return the remaining fragment of `w:p` clone --
	return p

	@lazyproperty
	def _following_frag_in_run(self) -> CT_P:
	"""following CT_P fragment when break does not occur in a hyperlink.

	Note this is a partial-function and raises when `lrpb` is inside a hyperlink.
	"""
	if self._is_in_hyperlink:
	raise ValueError("only defined on a rendered page-break not in a hyperlink")

	# -- work on a clone `w:p` so our mutations don't persist --
	p = copy.deepcopy(self._enclosing_p)

	# -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
	lrpb = self._first_lrpb_in_p(p)

	# -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
	enclosing_r = lrpb.xpath("./parent::w:r")[0]

	# -- delete all w:p inner-content preceding that run (but not w:pPr) --
	for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
	p.remove(e)

	# -- then remove all run inner-content preceding this lrpb in its run (but not
	# -- the `w:rPr`) and also remove the page-break itself
	for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
	enclosing_r.remove(e)
	enclosing_r.remove(lrpb)

	return p

	@lazyproperty
	def _is_in_hyperlink(self) -> bool:
	"""True when this page-break is embedded in a hyperlink run."""
	return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))

	@lazyproperty
	def _preceding_frag_in_hlink(self) -> CT_P:
	"""Preceding CT_P fragment when break occurs within a hyperlink.

	Note this is a partial-function and raises when `lrpb` is not inside a
	hyperlink.
	"""
	if not self._is_in_hyperlink:
	raise ValueError("only defined on a rendered page-break in a hyperlink")

	# -- work on a clone `w:p` so our mutations don't persist --
	p = copy.deepcopy(self._enclosing_p)

	# -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
	lrpb = self._first_lrpb_in_p(p)

	# -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
	hyperlink = lrpb._enclosing_hyperlink(lrpb)

	# -- delete all w:p inner-content following the hyperlink --
	for e in hyperlink.xpath("./following-sibling::*"):
	p.remove(e)

	# -- remove this page-break from inside the hyperlink --
	lrpb.getparent().remove(lrpb)

	# -- that's it, the entire hyperlink goes into the preceding fragment so
	# -- the hyperlink is not "split".
	return p

	@lazyproperty
	def _preceding_frag_in_run(self) -> CT_P:
	"""Preceding CT_P fragment when break does not occur in a hyperlink.

	Note this is a partial-function and raises when `lrpb` is inside a hyperlink.
	"""
	if self._is_in_hyperlink:
	raise ValueError("only defined on a rendered page-break not in a hyperlink")

	# -- work on a clone `w:p` so our mutations don't persist --
	p = copy.deepcopy(self._enclosing_p)

	# -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
	lrpb = self._first_lrpb_in_p(p)

	# -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
	enclosing_r = lrpb.xpath("./parent::w:r")[0]

	# -- delete all `w:p` inner-content following that run --
	for e in enclosing_r.xpath("./following-sibling::*"):
	p.remove(e)

	# -- then delete all `w:r` inner-content following this lrpb in its run and
	# -- also remove the page-break itself
	for e in lrpb.xpath("./following-sibling::*"):
	enclosing_r.remove(e)
	enclosing_r.remove(lrpb)

	return p

	@lazyproperty
	def _run_inner_content_xpath(self) -> str:
	"""XPath fragment matching any run inner-content elements."""
	return (
	"self::w:br"
	" \| self::w:cr"
	" \| self::w:drawing"
	" \| self::w:noBreakHyphen"
	" \| self::w:ptab"
	" \| self::w:t"
	" \| self::w:tab"
	)