Spaces:
Paused
Paused
| """Custom element class for rendered page-break (CT_LastRenderedPageBreak).""" | |
| from __future__ import annotations | |
| import copy | |
| from typing import TYPE_CHECKING | |
| from docx.oxml.xmlchemy import BaseOxmlElement | |
| from docx.shared import lazyproperty | |
| if TYPE_CHECKING: | |
| from docx.oxml.text.hyperlink import CT_Hyperlink | |
| from docx.oxml.text.paragraph import CT_P | |
class CT_LastRenderedPageBreak(BaseOxmlElement):
    """`<w:lastRenderedPageBreak>` element, indicating page break inserted by renderer.

    A rendered page-break is one inserted by the renderer when it runs out of room on a
    page. It is an empty element (no attrs or children) and is a child of CT_R, peer to
    CT_Text.

    NOTE: this complex-type name does not exist in the schema, where
    `w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
    distinguished behavior. CT_Empty is used for many elements.
    """

    @property
    def following_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content following this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content preceding
        it removed. Any `w:pPr` (and, for a break in a run, that run's `w:rPr`) is
        retained so the fragment keeps its formatting.

        NOTE: this `w:p` can itself contain one or more `w:lastRenderedPageBreak`
        elements (when the paragraph contained more than one). While this is rare, the
        caller should treat this paragraph the same as other paragraphs and split it if
        necessary in a following step or recursion.
        """
        if self != self._first_lrpb_in_p(self._enclosing_p):
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        return (
            self._following_frag_in_hlink
            if self._is_in_hyperlink
            else self._following_frag_in_run
        )

    @property
    def follows_all_content(self) -> bool:
        """True when this page-break element is the last "content" in the paragraph.

        This is a very uncommon case and may only occur in contrived cases or cases
        where the XML is edited by hand, but it is not precluded by the spec.
        """
        # -- a page-break inside a hyperlink never meets these criteria (for our
        # -- purposes at least) because it is considered "atomic" and always associated
        # -- with the page it starts on.
        if self._is_in_hyperlink:
            return False

        return bool(
            # -- a non-empty match list means a qualifying page-break exists --
            self._enclosing_p.xpath(
                # -- in last run of paragraph --
                "(./w:r)[last()]"
                # -- all page-breaks --
                "/w:lastRenderedPageBreak"
                # -- that are not followed by any content-bearing elements --
                f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
            )
        )

    @property
    def precedes_all_content(self) -> bool:
        """True when a `w:lastRenderedPageBreak` precedes all paragraph content.

        This is a common case; it occurs whenever the page breaks on an even paragraph
        boundary.
        """
        # -- a page-break inside a hyperlink never meets these criteria because there
        # -- is always part of the hyperlink text before the page-break.
        if self._is_in_hyperlink:
            return False

        return bool(
            # -- a non-empty match list means a qualifying page-break exists --
            self._enclosing_p.xpath(
                # -- in first run of paragraph --
                "./w:r[1]"
                # -- all page-breaks --
                "/w:lastRenderedPageBreak"
                # -- that are not preceded by any content-bearing elements --
                f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
            )
        )

    @property
    def preceding_fragment_p(self) -> CT_P:
        """A "loose" `CT_P` containing only the paragraph content before this break.

        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
        page-break in its paragraph.

        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
        page-break with this `w:lastRenderedPageBreak` element and all content
        following it removed. When the break is inside a hyperlink, the entire
        hyperlink is kept in this fragment (only the break element itself is removed).
        """
        if self != self._first_lrpb_in_p(self._enclosing_p):
            raise ValueError("only defined on first rendered page-break in paragraph")

        # -- splitting approach is different when break is inside a hyperlink --
        return (
            self._preceding_frag_in_hlink
            if self._is_in_hyperlink
            else self._preceding_frag_in_run
        )

    def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
        """The `w:hyperlink` grandparent of `lrpb`.

        `lrpb` is passed explicitly (rather than using `self`) so this can be applied
        to the counterpart page-break located in a cloned paragraph.

        Raises `IndexError` when `lrpb` has a `w:p` grandparent, so only call when the
        page-break is known to be inside a hyperlink.
        """
        return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]

    @property
    def _enclosing_p(self) -> CT_P:
        """The `w:p` element parent or grandparent of this `w:lastRenderedPageBreak`."""
        return self.xpath("./ancestor::w:p[1]")[0]

    def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
        """The first `w:lastRenderedPageBreak` element in `p`.

        Both page-breaks in runs directly under `p` and those in runs inside a
        `w:hyperlink` child of `p` are considered.

        Raises `ValueError` if there are no rendered page-breaks in `p`.
        """
        lrpbs = p.xpath(
            "./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak"
        )
        if not lrpbs:
            raise ValueError("no rendered page-breaks in paragraph element")
        return lrpbs[0]

    @lazyproperty
    def _following_frag_in_hlink(self) -> CT_P:
        """Following CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is not inside a hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content preceding the hyperlink (but not w:pPr) --
        for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
        hyperlink.getparent().remove(hyperlink)

        # -- that's it, return the remaining fragment of `w:p` clone --
        return p

    @lazyproperty
    def _following_frag_in_run(self) -> CT_P:
        """Following CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all w:p inner-content preceding that run (but not w:pPr) --
        for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
            p.remove(e)

        # -- then remove all run inner-content preceding this lrpb in its run (but not
        # -- the `w:rPr`) and also remove the page-break itself
        for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _is_in_hyperlink(self) -> bool:
        """True when this page-break is embedded in a hyperlink run."""
        return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))

    @lazyproperty
    def _preceding_frag_in_hlink(self) -> CT_P:
        """Preceding CT_P fragment when break occurs within a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is not inside a hyperlink.
        """
        if not self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
        hyperlink = lrpb._enclosing_hyperlink(lrpb)

        # -- delete all w:p inner-content following the hyperlink --
        for e in hyperlink.xpath("./following-sibling::*"):
            p.remove(e)

        # -- remove this page-break from inside the hyperlink --
        lrpb.getparent().remove(lrpb)

        # -- that's it, the entire hyperlink goes into the preceding fragment so
        # -- the hyperlink is not "split".
        return p

    @lazyproperty
    def _preceding_frag_in_run(self) -> CT_P:
        """Preceding CT_P fragment when break does not occur in a hyperlink.

        Note this is a *partial-function* and raises `ValueError` when this page-break
        is inside a hyperlink.
        """
        if self._is_in_hyperlink:
            raise ValueError("only defined on a rendered page-break not in a hyperlink")

        # -- work on a clone `w:p` so our mutations don't persist --
        p = copy.deepcopy(self._enclosing_p)

        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
        lrpb = self._first_lrpb_in_p(p)

        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
        enclosing_r = lrpb.xpath("./parent::w:r")[0]

        # -- delete all `w:p` inner-content following that run --
        for e in enclosing_r.xpath("./following-sibling::*"):
            p.remove(e)

        # -- then delete all `w:r` inner-content following this lrpb in its run and
        # -- also remove the page-break itself
        for e in lrpb.xpath("./following-sibling::*"):
            enclosing_r.remove(e)
        enclosing_r.remove(lrpb)

        return p

    @lazyproperty
    def _run_inner_content_xpath(self) -> str:
        """XPath fragment matching any run inner-content elements.

        Intended for use inside a predicate, e.g. `*[{fragment}]`.
        """
        return (
            "self::w:br"
            " | self::w:cr"
            " | self::w:drawing"
            " | self::w:noBreakHyphen"
            " | self::w:ptab"
            " | self::w:t"
            " | self::w:tab"
        )