| """ |
| :mod:``pandas.io.xml`` is a module for reading XML. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import io |
| from os import PathLike |
| from typing import ( |
| TYPE_CHECKING, |
| Any, |
| Callable, |
| ) |
| import warnings |
|
|
| from pandas._libs import lib |
| from pandas.compat._optional import import_optional_dependency |
| from pandas.errors import ( |
| AbstractMethodError, |
| ParserError, |
| ) |
| from pandas.util._decorators import doc |
| from pandas.util._exceptions import find_stack_level |
| from pandas.util._validators import check_dtype_backend |
|
|
| from pandas.core.dtypes.common import is_list_like |
|
|
| from pandas.core.shared_docs import _shared_docs |
|
|
| from pandas.io.common import ( |
| file_exists, |
| get_handle, |
| infer_compression, |
| is_file_like, |
| is_fsspec_url, |
| is_url, |
| stringify_path, |
| ) |
| from pandas.io.parsers import TextParser |
|
|
| if TYPE_CHECKING: |
| from collections.abc import Sequence |
| from xml.etree.ElementTree import Element |
|
|
| from lxml import etree |
|
|
| from pandas._typing import ( |
| CompressionOptions, |
| ConvertersArg, |
| DtypeArg, |
| DtypeBackend, |
| FilePath, |
| ParseDatesArg, |
| ReadBuffer, |
| StorageOptions, |
| XMLParsers, |
| ) |
|
|
| from pandas import DataFrame |
|
|
|
|
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid XML ``str``, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The ``XPath`` expression to parse required set of nodes for
        migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.

    namespaces : dict
        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified ``xpath``.

    attrs_only : bool
        Parse only the attributes at the specified ``xpath``.

    names : list
        Column names for :class:`~pandas.DataFrame` of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        ``etree`` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): is_style is initialized but never read in this
        # module as visible here — presumably legacy; confirm before removal.
        self.is_style = None
        self.compression: CompressionOptions = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in ``xpath``, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        if self.elems_only:
            if self.names:
                # Include the element's own non-whitespace text first, then
                # child texts keyed positionally by the user-supplied names.
                dicts = [
                    {
                        **(
                            {el.tag: el.text}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
                    for el in elems
                ]

        elif self.attrs_only:
            # Attributes only; empty-string values normalized to None.
            dicts = [
                {k: v if v else None for k, v in el.attrib.items()} for el in elems
            ]

        elif self.names:
            # Both attributes and elements, with positional renaming.
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{
                        nm: ch.text if ch.text else None
                        for nm, ch in zip(self.names, el.findall("*"))
                    },
                }
                for el in elems
            ]

        else:
            # Default: attributes, element text, and child texts combined.
            dicts = [
                {
                    **el.attrib,
                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
                    **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")},
                }
                for el in elems
            ]

        # Strip namespace URIs from keys: '{uri}tag' -> 'tag'.
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Collect the insertion-ordered union of keys across all rows and
        # pad rows missing a key with None so rows align column-wise.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        # Rename keys positionally when the user supplied names.
        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If ``iterparse`` is not a dict or its dict value is not list-like.
        ParserError
            * If ``path_or_buffer`` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in ``iterparse``.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        # The single dict key names the repeating row element; its value
        # lists the descendant elements/attributes to collect per row.
        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse works only on uncompressed files on local disk (or
        # file-like objects): reject URLs, fsspec paths, literal XML
        # strings, and compressed files.
        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, (str, PathLike))
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or (
                isinstance(self.path_or_buffer, str)
                and self.path_or_buffer.startswith(("<?xml", "<"))
            )
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        # Repeated descendant names require positional matching against
        # ``names`` to disambiguate same-named columns.
        iterparse_repeats = len(self.iterparse[row_node]) != len(
            set(self.iterparse[row_node])
        )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            # Strip any namespace URI prefix: '{uri}tag' -> 'tag'.
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    # Entering a new row element: start a fresh row.
                    row = {}

                if row is not None:
                    if self.names and iterparse_repeats:
                        for col, nm in zip(self.iterparse[row_node], self.names):
                            if curr_elem == col:
                                elem_val = elem.text if elem.text else None
                                # Skip duplicates already captured for this row.
                                if elem_val not in row.values() and nm not in row:
                                    row[nm] = elem_val

                            if col in elem.attrib:
                                if elem.attrib[col] not in row.values() and nm not in row:
                                    row[nm] = elem.attrib[col]
                    else:
                        for col in self.iterparse[row_node]:
                            if curr_elem == col:
                                row[col] = elem.text if elem.text else None
                            if col in elem.attrib:
                                row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free memory as we go. lxml elements additionally expose
                # getprevious/getparent, letting us prune processed siblings
                # from the partially built tree.
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        # Align rows on the union of collected keys, padding gaps with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath``.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
|
|
|
|
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Validates ``xpath`` and names, then parses either the full document
        (default) or incrementally via iterparse, returning row dicts.
        """
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        # With iterparse the document is never fully loaded, so document
        # parsing and xpath validation are skipped.
        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Notes
        -----
        ``etree`` supports limited ``XPath``. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
            children = [ch for el in elems for ch in el.findall("*")]
            attrs = {k: v for el in elems for k, v in el.attrib.items()}

            # NOTE(review): findall returns a list, so this None check looks
            # defensive/unreachable; kept as-is pending confirmation.
            if elems is None:
                raise ValueError(msg)

            if elems is not None:
                if self.elems_only and children == []:
                    raise ValueError(msg)
                if self.attrs_only and attrs == {}:
                    raise ValueError(msg)
                if children == [] and attrs == {}:
                    raise ValueError(msg)

        except (KeyError, SyntaxError):
            # KeyError: undeclared namespace prefix; SyntaxError: xpath
            # expression not supported by etree.
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

        return elems

    def _validate_names(self) -> None:
        """
        Validate names.

        Raises ValueError when fewer names are supplied than there are
        children of the first matched node, and TypeError when names is
        not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                # With iterparse, names must cover the user-declared
                # descendant list rather than parsed children.
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ElementTree root element from path_or_buffer.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        # Resolve paths/URLs/compressed sources into raw XML data first.
        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
|
|
|
|
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
    full-featured XML library, ``lxml``, that supports
    ``XPath`` 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate ``xpath``, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        # With iterparse the document is never fully loaded, so document
        # parsing, XSLT transformation and xpath validation are skipped.
        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            # XSLT runs before xpath evaluation; the xpath therefore must
            # target the transformed document, not the original.
            if self.stylesheet:
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Validate ``xpath``.

        Raises ValueError when the expression matches no nodes, or matches
        nodes without the required children/attributes for the
        elems_only/attrs_only modes.
        """
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = [ch for el in elems for ch in el.xpath("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if elems == []:
            raise ValueError(msg)

        if elems != []:
            if self.elems_only and children == []:
                raise ValueError(msg)
            if self.attrs_only and attrs == {}:
                raise ValueError(msg)
            if children == [] and attrs == {}:
                raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        """
        Validate names.

        Raises ValueError when fewer names are supplied than there are
        children of the first matched node, and TypeError when names is
        not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                # With iterparse, names must cover the user-declared
                # descendant list rather than parsed children.
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                # Only the first matched node's children are counted.
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        """
        Build an lxml element tree from path_or_buffer.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        # Resolve paths/URLs/compressed sources into raw XML data first.
        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml cannot parse str with an XML declaration; re-encode
                # to bytes, which requires an explicit encoding.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                document = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                document = parse(xml_data, parser=curr_parser)

        return document

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc
|
|
|
|
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    # Normalize os.PathLike into a plain string path; bytes may be a raw
    # XML document and must not be stringified.
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # A string that does not look like literal XML and resolves to a URL,
    # fsspec path, or existing file is opened and read into memory.
    is_xml_literal = isinstance(filepath_or_buffer, str) and (
        filepath_or_buffer.startswith(("<?xml", "<"))
    )
    if isinstance(filepath_or_buffer, str) and not is_xml_literal:
        if (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            with get_handle(
                filepath_or_buffer,
                "r",
                encoding=encoding,
                compression=compression,
                storage_options=storage_options,
            ) as handle_obj:
                handle = handle_obj.handle
                filepath_or_buffer = (
                    handle.read() if hasattr(handle, "read") else handle
                )

    return filepath_or_buffer
|
|
|
|
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    Raw XML passed in as ``str`` or ``bytes`` is wrapped in an in-memory
    buffer so callers can treat every input uniformly as a file-like
    object; anything already file-like is returned untouched.
    """
    if isinstance(data, str):
        return io.StringIO(data)
    if isinstance(data, bytes):
        return io.BytesIO(data)
    return data
|
|
|
|
| def _data_to_frame(data, **kwargs) -> DataFrame: |
| """ |
| Convert parsed data to Data Frame. |
| |
| This method will bind xml dictionary data of keys and values |
| into named columns of Data Frame using the built-in TextParser |
| class that build Data Frame and infers specific dtypes. |
| """ |
|
|
| tags = next(iter(data)) |
| nodes = [list(d.values()) for d in data] |
|
|
| try: |
| with TextParser(nodes, names=tags, **kwargs) as tp: |
| return tp.read() |
| except ParserError: |
| raise ParserError( |
| "XML document may be too complex for import. " |
| "Try to flatten document and use distinct " |
| "element and attribute names." |
| ) |
|
|
|
|
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    LxmlFrameParser and/or EtreeParser.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """
    # Warn when a string is neither a file-like object, an existing file,
    # nor a URL — i.e. it is literal XML, which is deprecated input.
    if isinstance(path_or_buffer, str):
        recognized_source = any(
            [
                is_file_like(path_or_buffer),
                file_exists(path_or_buffer),
                is_url(path_or_buffer),
                is_fsspec_url(path_or_buffer),
            ]
        )
        if not recognized_source:
            warnings.warn(
                "Passing literal xml to 'read_xml' is deprecated and "
                "will be removed in a future version. To read from a "
                "literal string, wrap it in a 'StringIO' object.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    # Select the parser implementation; lxml is optional and must be
    # importable when requested.
    parser_class: type[_EtreeFrameParser] | type[_LxmlFrameParser]
    if parser == "lxml":
        if import_optional_dependency("lxml.etree", errors="ignore") is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        parser_class = _LxmlFrameParser
    elif parser == "etree":
        parser_class = _EtreeFrameParser
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    p: _EtreeFrameParser | _LxmlFrameParser = parser_class(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
|
|
|
|
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    r"""
    Read XML document into a :class:`~pandas.DataFrame` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

        .. deprecated:: 2.1.0
            Passing xml literal strings is deprecated.
            Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead.

    xpath : str, optional, default './\*'
        The ``XPath`` to parse required set of nodes for migration to
        :class:`~pandas.DataFrame`. ``XPath`` should return a collection of elements
        and not a single element. Note: The ``etree`` parser supports limited ``XPath``
        expressions. For more complex ``XPath``, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only :  bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names :  list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex ``XPath`` searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions are currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike ``xpath``, descendants do not need to relate to each other but can
        exist anywhere in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> from io import StringIO
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(StringIO(xml))
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(StringIO(xml), xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(StringIO(xml),
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml_data = '''
    ...         <data>
    ...            <row>
    ...               <index>0</index>
    ...               <a>1</a>
    ...               <b>2.5</b>
    ...               <c>True</c>
    ...               <d>a</d>
    ...               <e>2019-12-31 00:00:00</e>
    ...            </row>
    ...            <row>
    ...               <index>1</index>
    ...               <b>4.5</b>
    ...               <c>False</c>
    ...               <d>b</d>
    ...               <e>2019-12-31 00:00:00</e>
    ...            </row>
    ...         </data>
    ...         '''

    >>> df = pd.read_xml(StringIO(xml_data),
    ...                  dtype_backend="numpy_nullable",
    ...                  parse_dates=["e"])
    >>> df
       index     a    b      c  d          e
    0      0     1  2.5   True  a 2019-12-31
    1      1  <NA>  4.5  False  b 2019-12-31
    """
    check_dtype_backend(dtype_backend)

    # All real work is delegated to the internal dispatcher, which selects
    # the lxml or etree parser implementation and builds the DataFrame.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
    )
|
|