Spaces:
Paused
Paused
| """Tests to ensure that the lxml tree builder generates good trees.""" | |
| import pickle | |
| import pytest | |
| import re | |
| import warnings | |
| from . import LXML_PRESENT, LXML_VERSION | |
| if LXML_PRESENT: | |
| from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML | |
| from bs4 import ( | |
| BeautifulSoup, | |
| BeautifulStoneSoup, | |
| ) | |
| from bs4.element import Comment, Doctype, SoupStrainer | |
| from . import ( | |
| HTMLTreeBuilderSmokeTest, | |
| XMLTreeBuilderSmokeTest, | |
| SOUP_SIEVE_PRESENT, | |
| SoupTest, | |
| ) | |
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke tests plus lxml-specific checks for lxml's HTML tree builder.

    See ``HTMLTreeBuilderSmokeTest`` for the inherited tests.

    The whole class is skipped when lxml is missing, because
    ``LXMLTreeBuilder`` is only imported when ``LXML_PRESENT`` is true.
    """

    @property
    def default_builder(self):
        """The tree builder class exercised by this test class.

        Exposed as a property so that ``self.default_builder`` evaluates
        to the builder class itself rather than to a bound method.
        NOTE(review): presumably the ``SoupTest`` helpers read this as a
        plain attribute -- confirm against the base class.
        """
        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
        """Numeric character references outside Unicode are dropped."""
        # These must stay written as numeric character references
        # (decimal, hexadecimal, and a smaller decimal that is still
        # beyond U+10FFFF). A literal replacement character in the
        # markup would not exercise the entity handling at all.
        self.assert_soup(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assert_soup(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assert_soup(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        """Deliberately empty; see the comment below."""
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly)
        # converted into a string like u'\x93'
        pass

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.
    # NOTE(review): assumes LXML_VERSION is a version tuple that compares
    # against (2, 3, 5, 0) -- confirm where it is defined.
    @pytest.mark.skipif(
        not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
        reason="Skipping doctype test for old version of lxml to avoid segfault."
    )
    def test_empty_doctype(self):
        """An empty ``<!DOCTYPE>`` parses to a node whose text is blank."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        assert "" == doctype.strip()

    def test_beautifulstonesoup_is_xml_parser(self):
        """The deprecated ``BeautifulStoneSoup`` class parses as XML and warns."""
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
        # An XML builder keeps the tag self-closing.
        assert "<b/>" == str(soup.b)
        # Exactly one warning is expected, attributed to this file.
        [warning] = w
        assert warning.filename == __file__
        assert "BeautifulStoneSoup class is deprecated" in str(warning.message)

    def test_tracking_line_numbers(self):
        """``sourceline``/``sourcepos`` resolve to child tags, not numbers."""
        # The lxml TreeBuilder cannot keep track of line numbers from
        # the original markup. Even if you ask for line numbers, we
        # don't have 'em.
        #
        # This means that if you have a tag like <sourceline> or
        # <sourcepos>, attribute access will find it rather than
        # giving you a numeric answer.
        soup = self.soup(
            "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
            store_line_numbers=True
        )
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
    """Smoke tests plus lxml-specific checks for lxml's XML tree builder.

    See ``XMLTreeBuilderSmokeTest`` for the inherited tests.

    The whole class is skipped when lxml is missing, because
    ``LXMLTreeBuilderForXML`` is only imported when ``LXML_PRESENT`` is
    true.
    """

    @property
    def default_builder(self):
        """The tree builder class exercised by this test class.

        This has to be a property: ``test_pickle_restores_builder`` passes
        ``self.default_builder`` straight to ``isinstance()``, which needs
        the class itself, not a bound method.
        """
        return LXMLTreeBuilderForXML

    def test_namespace_indexing(self):
        """Check which namespace prefixes the soup and each tag record."""
        # NOTE(review): this markup is not well-formed XML (<subtag> and
        # <subsubtag> are never closed); the assertions presumably depend
        # on how the parser recovers, so it is left exactly as it is.
        soup = self.soup(
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
            '<subtag xmlns="http://another-unprefixed-namespace.com">'
            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
            '</prefix2:tag3>'
            '</root>'
        )

        # The BeautifulSoup object includes every namespace prefix
        # defined in the entire document. This is the default set of
        # namespaces used by soupsieve.
        #
        # Un-prefixed namespaces are not included, and if a given
        # prefix is defined twice, only the first prefix encountered
        # in the document shows up here.
        assert soup._namespaces == {
            'xml': 'http://www.w3.org/XML/1998/namespace',
            'prefix': 'http://prefixed-namespace.com',
            'prefix2': 'http://another-namespace.com'
        }

        # A Tag object includes only the namespace prefixes
        # that were in scope when it was parsed.

        # We do not track un-prefixed namespaces as we can only hold
        # one (the first one), and it will be recognized as the
        # default namespace by soupsieve, even when operating from a
        # tag with a different un-prefixed namespace.
        assert soup.tag._namespaces == {
            'xml': 'http://www.w3.org/XML/1998/namespace',
        }

        assert soup.tag2._namespaces == {
            'prefix': 'http://prefixed-namespace.com',
            'xml': 'http://www.w3.org/XML/1998/namespace',
        }

        assert soup.subtag._namespaces == {
            'prefix2': 'http://another-namespace.com',
            'xml': 'http://www.w3.org/XML/1998/namespace',
        }

        assert soup.subsubtag._namespaces == {
            'prefix2': 'http://another-namespace.com',
            'xml': 'http://www.w3.org/XML/1998/namespace',
        }

    # The select* methods are unusable without Soup Sieve, so skip this
    # test rather than fail when it is not installed.
    @pytest.mark.skipif(
        not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
    )
    def test_namespace_interaction_with_select_and_find(self):
        """Demonstrate how namespaces interact with select* and find*."""
        # NOTE(review): this markup is not well-formed XML (<prefix:tag2>
        # is closed by </tag>, <prefix:tag3> is never closed); the
        # assertions presumably depend on how the parser recovers, so it
        # is left exactly as it is.
        soup = self.soup(
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
            '<prefix:tag3>'
            '</subtag>'
            '</root>'
        )

        # The select* methods resolve prefixes to namespace URIs.
        assert soup.select_one('tag').name == 'tag'
        assert soup.select_one('prefix|tag2').name == 'tag2'

        # If a prefix is declared more than once, only the first usage
        # is registered with the BeautifulSoup object.
        assert soup.select_one('prefix|tag3') is None

        # But you can always explicitly specify a namespace dictionary.
        assert soup.select_one(
            'prefix|tag3', namespaces=soup.subtag._namespaces
        ).name == 'tag3'

        # And a Tag (as opposed to the BeautifulSoup object) will
        # have a set of default namespaces scoped to that Tag.
        assert soup.subtag.select_one('prefix|tag3').name == 'tag3'

        # the find() methods aren't fully namespace-aware; they just
        # look at prefixes.
        assert soup.find('tag').name == 'tag'
        assert soup.find('prefix:tag2').name == 'tag2'
        assert soup.find('prefix:tag3').name == 'tag3'
        assert soup.subtag.find('prefix:tag3').name == 'tag3'

    def test_pickle_restores_builder(self):
        """Unpickling a soup gives it a fresh builder of the same class."""
        # The lxml TreeBuilder is not picklable, so when unpickling
        # a document created with it, a new TreeBuilder of the
        # appropriate class is created.
        soup = self.soup("<a>some markup</a>")
        assert isinstance(soup.builder, self.default_builder)

        # Round-trip through pickle; the data is produced by this test,
        # so loading it is safe.
        pickled = pickle.dumps(soup)
        unpickled = pickle.loads(pickled)

        assert "some markup" == unpickled.a.string
        # A distinct builder object, but of the same class.
        assert unpickled.builder != soup.builder
        assert isinstance(unpickled.builder, self.default_builder)