| | """Tests to ensure that the html.parser tree builder generates good |
| | trees.""" |
| |
|
| | from pdb import set_trace |
| | import pickle |
| | import pytest |
| | import warnings |
| | from bs4.builder import ( |
| | HTMLParserTreeBuilder, |
| | ParserRejectedMarkup, |
| | XMLParsedAsHTMLWarning, |
| | ) |
| | from bs4.builder._htmlparser import BeautifulSoupHTMLParser |
| | from . import SoupTest, HTMLTreeBuilderSmokeTest |
| |
|
class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """Tests that exercise HTMLParserTreeBuilder.

    Combines the tests inherited from SoupTest and
    HTMLTreeBuilderSmokeTest with the html.parser-specific tests
    defined below.
    """

    default_builder = HTMLParserTreeBuilder

    def test_rejected_input(self):
        # Each of these malformed "<![" declarations must be reported
        # as a ParserRejectedMarkup exception rather than being parsed
        # or raising some other error.
        bad_markup = [
            b'\n<![\xff\xfe\xfe\xcd\x00',
            b'<![n\x00',
            b"<![UNKNOWN[]]>",
        ]
        for markup in bad_markup:
            with pytest.raises(ParserRejectedMarkup):
                self.soup(markup)

    def test_namespaced_system_doctype(self):
        # Intentionally a no-op. NOTE(review): presumably this overrides
        # a same-named test inherited from HTMLTreeBuilderSmokeTest that
        # html.parser cannot pass -- confirm against the base class.
        pass

    def test_namespaced_public_doctype(self):
        # Intentionally a no-op. NOTE(review): presumably this overrides
        # a same-named test inherited from HTMLTreeBuilderSmokeTest that
        # html.parser cannot pass -- confirm against the base class.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder will be
        restored after pickling.
        """
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        assert isinstance(loaded.builder, type(tree.builder))

    def test_redundant_empty_element_closing_tags(self):
        # A closing tag for an empty element is redundant: paired with
        # an opening tag it collapses to one self-closing tag, and on
        # its own it produces nothing.
        self.assert_soup('<br></br><br></br><br></br>', "<br/><br/><br/>")
        self.assert_soup('</br></br></br>', "")

    def test_empty_element(self):
        # An incomplete character reference ("&#") must survive parsing
        # as literal text. On output the bare ampersand is escaped, so
        # the expected serialization contains "&amp;".
        self.assert_soup("foo &# bar", "foo &amp;# bar")

    def test_tracking_line_numbers(self):
        # By default the builder records where each tag started. The
        # <p> tag is on line 2, preceded by three spaces, so its
        # 0-based column offset is 3.
        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
        soup = self.soup(markup)
        assert 2 == soup.p.sourceline
        assert 3 == soup.p.sourcepos
        # While positions are being stored, a child tag that happens to
        # be named "sourceline" is only reachable through find().
        assert "sourceline" == soup.p.find('sourceline').name

        # With store_line_numbers=False the attribute names are free
        # again, so attribute access resolves to the child tags.
        soup = self.soup(markup, store_line_numbers=False)
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name

    def test_on_duplicate_attribute(self):
        # The 'href' attribute is given three times; on_duplicate_attribute
        # decides which value(s) end up on the tag.
        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # With no on_duplicate_attribute argument, the last value wins
        # and the non-duplicated attributes are unaffected.
        soup = self.soup(markup)
        assert "url3" == soup.a['href']
        assert ["cls"] == soup.a['class']
        assert "id" == soup.a['id']

        def assert_attribute(on_duplicate_attribute, expected):
            # Parse `markup` with the given strategy and check the
            # resulting 'href' value.
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            assert expected == soup.a['href']

            # Non-duplicated attributes must be treated normally
            # whatever the strategy.
            assert ["cls"] == soup.a['class']
            assert "id" == soup.a['id']

        # None and REPLACE both keep the last value seen.
        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # IGNORE keeps the first value seen.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # A callable receives the attribute mapping, the duplicated key
        # and the new value, and may merge them however it likes. This
        # one collects every value into a list.
        def accumulate(attrs, key, value):
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)
        assert_attribute(accumulate, ["url1", "url2", "url3"])

    def test_html5_attributes(self):
        # Each row is (entity reference in the markup, the Unicode text
        # it must parse to, the bytes it must serialize to with
        # formatter="html"). The outgoing entity name may differ from
        # the incoming one, e.g. &RightArrowLeftArrow; -> &rlarr;.
        #
        # The markup and expected-bytes columns are spelled as ASCII
        # entity references: bytes literals may contain only ASCII
        # characters, and a raw character in the markup would not
        # exercise entity decoding at all.
        for input_element, output_unicode, output_element in (
            ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
            ('&models;', '\u22a7', b'&models;'),
            ('&Nfr;', '\U0001d511', b'&Nfr;'),
            ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
            ('&not;', '\xac', b'&not;'),
            ('&Not;', '\u2aec', b'&Not;'),
            ('&quot;', '"', b'"'),
            ('&there4;', '\u2234', b'&there4;'),
            ('&Therefore;', '\u2234', b'&there4;'),
            ('&therefore;', '\u2234', b'&there4;'),
            ("&fjlig;", 'fj', b'fj'),
            ("&sqcup;", '\u2294', b'&sqcup;'),
            ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
            ("&apos;", "'", b"'"),
            ("&verbar;", "|", b"|"),
        ):
            markup = '<div>%s</div>' % input_element
            div = self.soup(markup).div

            # Default encoding: the parsed Unicode text as UTF-8.
            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            assert without_element == expect

            # formatter="html": characters are written back out as
            # named entities where one is used.
            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            assert with_element == expect
| |
|