# Page-header residue captured with this file (not part of the module): "Spaces: Paused / Paused"
# encoding: utf-8
import logging

import bs4
import pytest
from bs4 import BeautifulSoup
from bs4.dammit import (
    EncodingDetector,
    EntitySubstitution,
    UnicodeDammit,
)
class TestUnicodeDammit(object):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        """Markup that is already a ``str`` is exposed unchanged."""
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        assert dammit.unicode_markup == markup

    # NOTE(review): the parametrization was missing from this copy of the
    # file, which left the two arguments below as unresolvable fixtures.
    # The cases are reconstructed from the documented behaviour of
    # ``smart_quotes_to`` -- confirm against the upstream test suite.
    @pytest.mark.parametrize(
        "smart_quotes_to,expect_converted",
        [
            (None, "\u2018\u2019\u201c\u201d"),
            ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
            ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
            ("ascii", "''" + '""'),
        ],
    )
    def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
        """Verify the functionality of the smart_quotes_to argument
        to the UnicodeDammit constructor."""
        # \x91-\x94 are the four Windows-1252 "smart quote" bytes.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        converted = UnicodeDammit(
            markup,
            known_definite_encodings=["windows-1252"],
            smart_quotes_to=smart_quotes_to,
        ).unicode_markup
        assert converted == "<foo>{}</foo>".format(expect_converted)

    def test_detect_utf8(self):
        """UTF-8 bytes are detected as UTF-8 and decoded correctly."""
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        assert dammit.original_encoding.lower() == 'utf-8'
        assert dammit.unicode_markup == 'Sacr\xe9 bleu! \N{SNOWMAN}'

    def test_convert_hebrew(self):
        """A suggested ISO-8859-8 encoding is used to decode Hebrew bytes."""
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        assert dammit.original_encoding.lower() == 'iso-8859-8'
        assert dammit.unicode_markup == '\u05dd\u05d5\u05dc\u05e9'

    def test_dont_see_smart_quotes_where_there_are_none(self):
        """UTF-8 input round-trips byte-for-byte through UnicodeDammit."""
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        assert dammit.original_encoding.lower() == 'utf-8'
        assert dammit.unicode_markup.encode("utf-8") == utf_8

    def test_ignore_inappropriate_codecs(self):
        """A suggested codec that does not fit the data is passed over."""
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        assert dammit.original_encoding.lower() == 'utf-8'

    def test_ignore_invalid_codecs(self):
        """Suggested codec names that are not real codecs are passed over."""
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            assert dammit.original_encoding.lower() == 'utf-8'

    def test_exclude_encodings(self):
        """Encodings listed in ``exclude_encodings`` are never guessed."""
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        assert dammit.original_encoding.lower() == 'windows-1252'

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        assert dammit.original_encoding is None
class TestEncodingDetector(object):
    """Tests of encoding detection: EncodingDetector and the parts of
    UnicodeDammit that rely on it."""

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        """An undecodable byte in a declared encoding name shows up as
        REPLACEMENT CHARACTER in the candidate encodings."""
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        """``<meta charset=...>`` is honoured however the value is quoted."""
        for data in (
                b'<html><meta charset="euc-jp" /></html>',
                b"<html><meta charset='euc-jp' /></html>",
                b"<html><meta charset=euc-jp /></html>",
                b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            assert "euc-jp" == dammit.original_encoding

    def test_last_ditch_entity_replacement(self):
        """A document with no consistent encoding is decoded with
        REPLACEMENT CHARACTERs, and that fact is flagged."""
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        # Silence the warnings emitted while the fallback path runs.
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                # Stand-in for chardet that never offers a guess.
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            assert dammit.contains_replacement_characters is True
            assert "\ufffd" in dammit.unicode_markup

            soup = BeautifulSoup(doc, "html.parser")
            assert soup.contains_replacement_characters
        finally:
            # Always restore global state, even if an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        """The byte-order mark is stripped and used to pick the encoding."""
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        assert "<a>áé</a>" == dammit.unicode_markup
        assert "utf-16le" == dammit.original_encoding

    def test_known_definite_versus_user_encodings(self):
        """known_definite_encodings win over the BOM; the BOM wins over
        user_encodings."""
        # The known_definite_encodings are used before sniffing the
        # byte-order mark; the user_encodings are used afterwards.

        # Here's a document in UTF-16LE.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)

        # We can process it as UTF-16 by passing it in as a known
        # definite encoding.
        before = UnicodeDammit(data, known_definite_encodings=["utf-16"])
        assert "utf-16" == before.original_encoding

        # If we pass UTF-8 as a user encoding, it's not even
        # tried--the encoding sniffed from the byte-order mark takes
        # precedence.
        after = UnicodeDammit(data, user_encodings=["utf-8"])
        assert "utf-16le" == after.original_encoding
        assert ["utf-16le"] == [x[0] for x in dammit.tried_encodings]
        # Check the instance that was actually given a user encoding, too;
        # otherwise the "not even tried" claim above goes unverified.
        assert ["utf-16le"] == [x[0] for x in after.tried_encodings]

        # Here's a document in ISO-8859-8.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, known_definite_encodings=["utf-8"],
                               user_encodings=["iso-8859-8"])

        # The known_definite_encodings don't work, BOM sniffing does
        # nothing (it only works for a few UTF encodings), but one of
        # the user_encodings does work.
        assert "iso-8859-8" == dammit.original_encoding
        assert ["utf-8", "iso-8859-8"] == [x[0] for x in dammit.tried_encodings]

    def test_deprecated_override_encodings(self):
        """override_encodings still works and is tried after
        known_definite_encodings but before user_encodings."""
        # override_encodings is a deprecated alias for
        # known_definite_encodings.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(
            hebrew,
            known_definite_encodings=["shift-jis"],
            override_encodings=["utf-8"],
            user_encodings=["iso-8859-8"],
        )
        assert "iso-8859-8" == dammit.original_encoding

        # known_definite_encodings and override_encodings were tried
        # before user_encodings.
        assert ["shift-jis", "utf-8", "iso-8859-8"] == (
            [x[0] for x in dammit.tried_encodings]
        )

    def test_detwingle(self):
        """detwingle() turns Windows-1252 embedded in UTF-8 into pure UTF-8."""
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        with pytest.raises(UnicodeDecodeError):
            doc.decode("utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into mojibake: each snowman's three UTF-8 bytes
        # (E2 98 83) read as Windows-1252 come out as "â˜ƒ".

        # But if we run it through detwingle, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        assert "☃☃☃“Hi, I like Windows!”☃☃☃" == fixed.decode("utf8")

    def test_detwingle_ignores_multibyte_characters(self):
        """detwingle() leaves genuine multibyte UTF-8 characters alone."""
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
                "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
                "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
                # 4-byte char '\xf0\x90\x90\x93' (U+10413, Deseret block).
                # The previous literal "\xf0\x90\x90\x93" was a str of four
                # separate Latin-1 code points, so no 4-byte sequence was
                # ever exercised.
                "\U00010413",
        ):
            encoded = tricky_unicode_char.encode("utf8")
            assert encoded.endswith(b'\x93')
            output = UnicodeDammit.detwingle(encoded)
            assert output == encoded

    def test_find_declared_encoding(self):
        """find_declared_encoding() reads XML and HTML encoding declarations."""
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        assert m(html_unicode, is_html=False) is None
        assert "utf-8" == m(html_unicode, is_html=True)
        assert "utf-8" == m(html_bytes, is_html=True)

        assert "iso-8859-1" == m(xml_unicode)
        assert "iso-8859-1" == m(xml_bytes)

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        assert m(spacer + html_bytes) is None
        assert m(spacer + xml_bytes) is None

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        assert (
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
            == "utf-8"
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        assert m(xml_bytes, search_entire_document=True) == "iso-8859-1"
        assert m(b' ' + xml_bytes, search_entire_document=True) == "iso-8859-1"
        assert m(b'a' + xml_bytes, search_entire_document=True) is None
class TestEntitySubstitution(object):
    """Standalone tests of the EntitySubstitution class."""

    # NOTE(review): this copy of the file had been HTML-decoded once, which
    # turned the entity names inside the expected-value literals into raw
    # characters, and it had lost its decorators. The entity names and the
    # parametrization below are reconstructions -- confirm against the
    # upstream test suite.

    def setup_method(self):
        """Give every test a short alias for the class under test."""
        self.sub = EntitySubstitution

    @pytest.mark.parametrize(
        "original,substituted",
        [
            # Basic case. Unicode characters corresponding to named
            # HTML entities are substituted; others are not.
            ("foo\u2200\N{SNOWMAN}\u00f5bar",
             "foo&forall;\N{SNOWMAN}&otilde;bar"),

            # MS smart quotes are a common source of frustration, so we
            # give them a special test.
            ("\u2018\u2019foo\u201c\u201d", "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
        ],
    )
    def test_substitute_html(self, original, substituted):
        """substitute_html() replaces characters that have named entities."""
        assert self.sub.substitute_html(original) == substituted

    def test_html5_entity(self):
        """Spot-check conversion of characters to HTML5 named entities."""
        for entity, u in (
                # A few spot checks of our ability to recognize
                # special character sequences and convert them
                # to named entities.
                ('&models;', '\u22a7'),
                ('&Nfr;', '\U0001d511'),
                ('&ngeqq;', '\u2267\u0338'),
                ('&not;', '\xac'),
                ('&Not;', '\u2aec'),

                # We _could_ convert | to &verbar;, but we don't, because
                # | is an ASCII character.
                #
                # This entry used to be ('|' '|') with no comma: a single
                # string '||' that only worked because it unpacks into
                # two characters.
                ('|', '|'),

                # Similarly for the fj ligature, which we could convert to
                # &fjlig;, but we don't.
                ("fj", "fj"),

                # We do convert _these_ ASCII characters to HTML entities,
                # because that's required to generate valid HTML.
                ('&gt;', '>'),
                ('&lt;', '<'),
                ('&amp;', '&'),
        ):
            template = '3 %s 4'
            raw = template % u
            with_entities = template % entity
            assert self.sub.substitute_html(raw) == with_entities

    def test_html5_entity_with_variation_selector(self):
        """Entities defined with and without VARIATION SELECTOR 1 both work."""
        # Some HTML5 entities correspond either to a single-character
        # Unicode sequence _or_ to the same character plus U+FE00,
        # VARIATION SELECTOR 1. We can handle this.
        data = "fjords \u2294 penguins"
        markup = "fjords &sqcup; penguins"
        assert self.sub.substitute_html(data) == markup

        data = "fjords \u2294\ufe00 penguins"
        markup = "fjords &sqcups; penguins"
        assert self.sub.substitute_html(data) == markup

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        """With quoting off, the value is returned without added quotes."""
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, False) == s

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        """Quoted attribute values are wrapped in double quotes by default."""
        assert self.sub.substitute_xml("Welcome", True) == '"Welcome"'
        assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"'

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        """A value containing double quotes is wrapped in single quotes."""
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'"

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        """With both quote kinds present, double quotes become &quot;."""
        s = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(s, True) == '"Welcome to &quot;Bob\'s Bar&quot;"'

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        """Quotes are left alone when the value is not being quoted."""
        quoted = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(quoted) == quoted

    def test_xml_quoting_handles_angle_brackets(self):
        """Angle brackets are escaped as &lt; and &gt;."""
        assert self.sub.substitute_xml("foo<bar>") == "foo&lt;bar&gt;"

    def test_xml_quoting_handles_ampersands(self):
        """A bare ampersand is escaped as &amp;."""
        assert self.sub.substitute_xml("AT&T") == "AT&amp;T"

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        """substitute_xml() escapes every ampersand, even inside an entity."""
        assert self.sub.substitute_xml("&Aacute;T&T") == "&amp;Aacute;T&amp;T"

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        """substitute_xml_containing_entities() keeps existing entities intact."""
        assert self.sub.substitute_xml_containing_entities("&Aacute;T&T") == "&Aacute;T&amp;T"

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        assert self.sub.substitute_html(text) == text