Spaces:
Paused
Paused
File size: 2,324 Bytes
# pyright: reportImportCycles=false
"""XML parser for python-docx."""
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, Type, cast
from lxml import etree
from docx.oxml.ns import NamespacePrefixedTag, nsmap
if TYPE_CHECKING:
from docx.oxml.xmlchemy import BaseOxmlElement
# -- shared XML parser used by every function in this module --
# Blank text is removed while parsing and entities are left unresolved.
oxml_parser = etree.XMLParser(
    remove_blank_text=True,
    resolve_entities=False,
)

# -- namespace-keyed registry of custom element classes; it is populated by
# -- `register_element_cls()` and consulted by `oxml_parser` --
element_class_lookup = etree.ElementNamespaceClassLookup()
oxml_parser.set_element_class_lookup(element_class_lookup)
def parse_xml(xml: str | bytes) -> "BaseOxmlElement":
    """Parse `xml` and return its root element.

    Parsing is done with the module-level `oxml_parser`, so any element in `xml`
    whose tag has a registered custom element class is produced as an instance of
    that class.
    """
    root = etree.fromstring(xml, oxml_parser)
    # -- lxml's declared return type is the generic element; narrow it for callers --
    return cast("BaseOxmlElement", root)
def register_element_cls(tag: str, cls: Type["BaseOxmlElement"]):
    """Associate custom element class `cls` with the XML element named by `tag`.

    After registration, an instance of `cls` is constructed whenever the oxml parser
    encounters an element with a matching tag. `tag` is a namespace-prefixed name of
    the form `nspfx:tagroot`, e.g. `'w:document'`; its prefix is used as a key into
    `nsmap` to find the namespace to register under.
    """
    prefix, local_name = tag.split(":")
    # -- the lookup is keyed by namespace URI, then by un-prefixed tag name --
    element_class_lookup.get_namespace(nsmap[prefix])[local_name] = cls
def OxmlElement(
    nsptag_str: str,
    attrs: Dict[str, str] | None = None,
    nsdecls: Dict[str, str] | None = None,
) -> BaseOxmlElement | etree._Element:  # pyright: ignore[reportPrivateUsage]
    """Create and return a new "loose" lxml element for the tag `nsptag_str`.

    `nsptag_str` must include the standard namespace prefix, e.g. `a:tbl`. When a
    custom element class is defined for that tag name, the returned element is an
    instance of it.

    `attrs`, when provided, is a dictionary of attribute values to set on the new
    element. `nsdecls`, when provided, maps namespace prefix to namespace name and
    each entry is declared on the element. When `nsdecls` is omitted, a single
    namespace declaration is added, based on the prefix of `nsptag_str`.
    """
    nsptag = NamespacePrefixedTag(nsptag_str)
    # -- an explicit `nsdecls` (even an empty dict) takes precedence over the default --
    declared_namespaces = nsptag.nsmap if nsdecls is None else nsdecls
    return oxml_parser.makeelement(
        nsptag.clark_name,
        attrib=attrs,
        nsmap=declared_namespaces,
    )
|