|
|
|
|
|
"""Beautiful Soup bonus library: Unicode, Dammit |
|
|
|
|
|
This library converts a bytestream to Unicode through any means |
|
|
necessary. It is heavily based on code from Mark Pilgrim's `Universal |
|
|
Feed Parser <https://pypi.org/project/feedparser/>`_, now maintained |
|
|
by Kurt McKee. It does not rewrite the body of an XML or HTML document |
|
|
to reflect a new encoding; that's the job of `TreeBuilder`. |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
__license__ = "MIT" |
|
|
|
|
|
from html.entities import codepoint2name |
|
|
from collections import defaultdict |
|
|
import codecs |
|
|
from html.entities import html5 |
|
|
import re |
|
|
from logging import Logger, getLogger |
|
|
from types import ModuleType |
|
|
from typing import ( |
|
|
Dict, |
|
|
Iterator, |
|
|
List, |
|
|
Optional, |
|
|
Pattern, |
|
|
Set, |
|
|
Tuple, |
|
|
Type, |
|
|
Union, |
|
|
cast, |
|
|
) |
|
|
from typing_extensions import Literal |
|
|
from bs4._typing import ( |
|
|
_Encoding, |
|
|
_Encodings, |
|
|
) |
|
|
import warnings |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pick the first character-detection library that can be imported, in
# order of preference: cchardet, then chardet, then charset_normalizer.
# chardet_module stays None when none of them is installed.
chardet_module: Optional[ModuleType] = None

try:
    import cchardet
except ImportError:
    pass
else:
    chardet_module = cchardet

if chardet_module is None:
    try:
        import chardet
    except ImportError:
        pass
    else:
        chardet_module = chardet

if chardet_module is None:
    try:
        import charset_normalizer
    except ImportError:
        pass
    else:
        chardet_module = charset_normalizer
|
|
|
|
|
|
|
|
def _chardet_dammit(s: bytes) -> Optional[str]:
    """Ask the detection library (if any) to guess the encoding of ``s``.

    :param s: The bytestring to analyze.
    :return: The ``"encoding"`` entry of the library's ``detect()``
        result, or None if no detection library was imported or ``s``
        is actually a `str`.
    """
    detector = chardet_module
    if detector is not None and not isinstance(s, str):
        return detector.detect(s)["encoding"]
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pattern source for the encoding named in an XML declaration.
xml_encoding: str = "^\\s*<\\?.*encoding=['\"](.*?)['\"].*\\?>"

# Pattern source for the charset named in an HTML <meta> tag.
html_meta: str = "<\\s*meta[^>]+charset\\s*=\\s*[\"']?([^>]*?)[ /;'\">]"

# The same two patterns compiled twice: once for searching bytestrings
# and once for searching Unicode strings. Keyed first by the markup's
# type, then by the kind of document.
encoding_res: Dict[Type, Dict[str, Pattern]] = {
    bytes: {
        "html": re.compile(html_meta.encode("ascii"), re.I),
        "xml": re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        "html": re.compile(html_meta, re.I),
        "xml": re.compile(xml_encoding, re.I),
    },
}
|
|
|
|
|
|
|
|
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    #: Maps HTML entity names (without "&" or ";") to the Unicode string
    #: each one stands for. Set by `_populate_class_variables`.
    HTML_ENTITY_TO_CHARACTER: Dict[str, str]

    #: Maps Unicode strings to the entity name used when writing them out.
    #: Set by `_populate_class_variables`.
    CHARACTER_TO_HTML_ENTITY: Dict[str, str]

    #: Matches (almost) any Unicode string that has an HTML5 named entity.
    #: Set by `_populate_class_variables`.
    CHARACTER_TO_HTML_ENTITY_RE: Pattern[str]

    #: Same as CHARACTER_TO_HTML_ENTITY_RE, but also matches every
    #: ampersand. Set by `_populate_class_variables`.
    CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: Pattern[str]

    @classmethod
    def _populate_class_variables(cls) -> None:
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function sets the following class variables:

        CHARACTER_TO_HTML_ENTITY - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        HTML_ENTITY_TO_CHARACTER: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        CHARACTER_TO_HTML_ENTITY_RE: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.

        CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE: A very similar
        regular expression to CHARACTER_TO_HTML_ENTITY_RE, but which
        also matches unescaped ampersands. This is used by the 'html'
        formatter to provide backwards-compatibility, even though the HTML5
        spec allows most ampersands to go unescaped.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # html5 lists some names both with and without the trailing
            # semicolon; work with the bare name in both cases.
            if name_with_semicolon.endswith(";"):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # Entity name -> character: the first mapping seen is kept.
            name_to_unicode.setdefault(name, character)

            # Character -> entity name: the last name in sorted order
            # wins here (and may be replaced from codepoint2name below).
            unicode_to_name[character] = name

            # The rest of the loop decides whether this character belongs
            # in the substitution regular expressions.

            # A single ASCII character is only substituted if it is an
            # angle bracket.
            if len(character) == 1 and ord(character) < 128 and character not in "<>":
                continue

            # Multi-character strings made up entirely of ASCII are
            # never substituted.
            if len(character) > 1 and all(ord(x) < 128 for x in character):
                continue

            if len(character) == 1 and character != "&":
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Build the alternatives ("particles") of the regular expression.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character.get(short)
            if not long_versions:
                particles.add(short)
            else:
                # Match the single character only when it is NOT followed
                # by a character that would turn it into one of the
                # longer entities starting with the same character.
                ignore = "".join(x[1] for x in long_versions)
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        re_definition = "(%s)" % "|".join(particles)

        # The second expression is identical except that it also matches
        # a lone ampersand.
        particles.add("&")
        re_definition_with_ampersand = "(%s)" % "|".join(particles)

        # Where codepoint2name also has a name for a character, that name
        # replaces whichever name was picked from html5.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        cls.CHARACTER_TO_HTML_ENTITY = unicode_to_name
        cls.HTML_ENTITY_TO_CHARACTER = name_to_unicode
        cls.CHARACTER_TO_HTML_ENTITY_RE = re.compile(re_definition)
        cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE = re.compile(
            re_definition_with_ampersand
        )

    #: The five characters that have named entities in XML.
    CHARACTER_TO_XML_ENTITY: Dict[str, str] = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    #: Matches anything shaped like an entity reference: a decimal or
    #: hexadecimal character reference, or "&", a word, and ";".
    ANY_ENTITY_RE = re.compile("&(#\\d+|#x[0-9a-fA-F]+|\\w+);", re.I)

    #: Matches an angle bracket, or an ampersand that does not start
    #: something shaped like an entity reference.
    BARE_AMPERSAND_OR_BRACKET: Pattern[str] = re.compile(
        "([<>]|" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")"
    )

    #: Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET: Pattern[str] = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match whose full text is the string to replace.
        :return: ``&name;`` for the matched string. If the string has no
            known entity name, the string itself is wrapped as
            ``&amp;...;`` so that no new, unknown entity is introduced.
        """
        original_entity = matchobj.group(0)
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(original_entity)
        if entity is None:
            return "&amp;%s;" % original_entity
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj: re.Match) -> str:
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :raise KeyError: If the matched text is not one of the keys of
            CHARACTER_TO_XML_ENTITY.
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def _escape_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of something that looks like an entity
        reference, turning ``&name;`` into ``&amp;name;``.
        """
        return "&amp;%s;" % matchobj.group(1)

    @classmethod
    def _escape_unrecognized_entity_name(cls, matchobj: re.Match) -> str:
        """Escape the ampersand of an entity reference only when its name
        is not a known HTML entity name.

        A recognized reference such as ``&divide;`` is returned
        unchanged; anything else (including numeric references, whose
        "name" starts with ``#``) becomes ``&amp;...;``.
        """
        possible_entity = matchobj.group(1)
        if possible_entity in cls.HTML_ENTITY_TO_CHARACTER:
            return "&%s;" % possible_entity
        return "&amp;%s;" % possible_entity

    @classmethod
    def quoted_attribute_value(cls, value: str) -> str:
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The XML attribute value to quote
        :return: The quoted value
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # Both kinds of quote are present, so neither can be used
                # as-is to delimit the value: escape the double quotes
                # and keep double quotes as the delimiter.
                value = value.replace('"', "&quot;")
            else:
                # Only double quotes are present; single quotes can
                # delimit the value with no escaping at all.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value: str, make_quoted_attribute: bool = False) -> str:
        """Replace special XML characters with named XML entities.

        The less-than sign will become &lt;, the greater-than sign
        will become &gt;, and any ampersands will become &amp;. If you
        want ampersands that seem to be part of an entity definition
        to be left alone, use `substitute_xml_containing_entities`
        instead.

        :param value: A string to be substituted.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: A version of ``value`` with special characters replaced
         with named entities.
        """
        value = cls.AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value: str, make_quoted_attribute: bool = False
    ) -> str:
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities.

        This differs from ``data.encode(encoding, 'xmlcharrefreplace')``
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_WITH_AMPERSAND_RE.sub(
            cls._substitute_html_entity, s
        )

    @classmethod
    def substitute_html5(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        Specifically, this method is much less aggressive about
        escaping ampersands than substitute_html. Only ambiguous
        ampersands are escaped, per the HTML5 standard:

        "An ambiguous ampersand is a U+0026 AMPERSAND character (&)
        that is followed by one or more ASCII alphanumerics, followed
        by a U+003B SEMICOLON character (;), where these characters do
        not match any of the names given in the named character
        references section."

        Unlike substitute_html5_raw, this method assumes HTML entities
        were converted to Unicode characters on the way in, as
        Beautiful Soup does. By the time Beautiful Soup does its work,
        the only ambiguous ampersands that need to be escaped are the
        ones that were escaped in the original markup when mentioning
        HTML entities.

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # First escape the ampersand of anything that looks like an
        # entity reference, recognized or not.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_entity_name, s)

        # Then replace special characters with their named entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s

    @classmethod
    def substitute_html5_raw(cls, s: str) -> str:
        """Replace certain Unicode characters with named HTML entities
        using HTML5 rules.

        substitute_html5_raw is similar to substitute_html5 but it is
        designed for standalone use (whereas substitute_html5 is
        designed for use with Beautiful Soup).

        :param s: The string to be modified.
        :return: The string with some Unicode characters replaced with
           HTML entities.
        """
        # Leave references to known entity names alone; escape the
        # ampersand of every other entity-shaped sequence.
        s = cls.ANY_ENTITY_RE.sub(cls._escape_unrecognized_entity_name, s)

        # Then replace special characters with their named entities.
        s = cls.CHARACTER_TO_HTML_ENTITY_RE.sub(cls._substitute_html_entity, s)

        return s


# Fill in the entity tables and regular expressions once, at import time.
EntitySubstitution._populate_class_variables()
|
|
|
|
|
|
|
|
class EncodingDetector:
    """Suggests, in priority order, encodings that might decode a bytestring.

    Candidates are offered in this order:

    1. The ``known_definite_encodings`` passed to the constructor
       (followed by any deprecated ``override_encodings``).

    2. The encoding implied by a byte-order mark at the start of the
       markup, if there is one.

    3. The ``user_encodings`` passed to the constructor.

    4. An encoding declared inside the markup: in an XML declaration,
       or (when the markup is treated as HTML) in a <meta> tag.

    5. Whatever an external detection library (chardet, cchardet, or
       similar) guesses from the bytes.

    6. UTF-8.

    7. Windows-1252.

    :param markup: Some markup in an unknown encoding.

    :param known_definite_encodings: Encodings to try before anything
        else, in order. In HTML terms, this corresponds to the "known
        definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_.

    :param user_encodings: Encodings to try once the
        ``known_definite_encodings`` and byte-order-mark sniffing have
        both failed. In HTML terms, this corresponds to the step "user
        has explicitly instructed the user agent to override the
        document's character encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_.

    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``. Encodings given here are tried
        right after the ones in ``known_definite_encodings``.

    :param is_html: If True, this markup is considered to be
        HTML. Otherwise it's assumed to be XML.

    :param exclude_encodings: Encodings that must never be suggested,
        even if they otherwise would be.
    """

    def __init__(
        self,
        markup: bytes,
        known_definite_encodings: Optional[_Encodings] = None,
        is_html: Optional[bool] = False,
        exclude_encodings: Optional[_Encodings] = None,
        user_encodings: Optional[_Encodings] = None,
        override_encodings: Optional[_Encodings] = None,
    ):
        definite = list(known_definite_encodings or [])
        if override_encodings:
            warnings.warn(
                "The 'override_encodings' argument was deprecated in 4.10.0. Use 'known_definite_encodings' instead.",
                DeprecationWarning,
                stacklevel=3,
            )
            definite += override_encodings
        self.known_definite_encodings = definite
        self.user_encodings = user_encodings or []

        # Exclusions are compared case-insensitively, so store them
        # lowercased.
        self.exclude_encodings = {name.lower() for name in (exclude_encodings or [])}

        self.chardet_encoding = None
        self.is_html = is_html if is_html is not None else False
        self.declared_encoding: Optional[str] = None

        # Remove any byte-order mark up front and remember which
        # encoding it pointed to.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    known_definite_encodings: _Encodings
    user_encodings: _Encodings
    exclude_encodings: _Encodings
    chardet_encoding: Optional[_Encoding]
    is_html: bool
    declared_encoding: Optional[_Encoding]
    markup: bytes
    sniffed_encoding: Optional[_Encoding]

    def _usable(self, encoding: Optional[_Encoding], tried: Set[_Encoding]) -> bool:
        """Decide whether an encoding is worth offering to the caller.

        :param encoding: Name of an encoding, or None.
        :param tried: Lowercased names already offered. A name that is
            accepted is added to this set as a side effect.
        :return: True only for a name that is not None, not excluded,
            and not already in ``tried``.
        """
        if encoding is None:
            return False
        lowered = encoding.lower()
        if lowered in self.exclude_encodings or lowered in tried:
            return False
        tried.add(lowered)
        return True

    @property
    def encodings(self) -> Iterator[_Encoding]:
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings. Each is the name of an encoding
            that *might* work to convert a bytestring into Unicode.
        """
        seen: Set[_Encoding] = set()

        # 1. Encodings the caller is sure about.
        yield from (e for e in self.known_definite_encodings if self._usable(e, seen))

        # 2. The encoding implied by the byte-order mark, if any.
        sniffed = self.sniffed_encoding
        if sniffed is not None and self._usable(sniffed, seen):
            yield sniffed

        # 3. Encodings the caller suggested as a fallback.
        yield from (e for e in self.user_encodings if self._usable(e, seen))

        # 4. An encoding declared in the document itself. The search
        #    result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html
            )
        declared = self.declared_encoding
        if declared is not None and self._usable(declared, seen):
            yield declared

        # 5. The detection library's guess, also cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = _chardet_dammit(self.markup)
        guessed = self.chardet_encoding
        if guessed is not None and self._usable(guessed, seen):
            yield guessed

        # 6-7. Last-ditch defaults.
        yield from (e for e in ("utf-8", "windows-1252") if self._usable(e, seen))

    @classmethod
    def strip_byte_order_mark(cls, data: bytes) -> Tuple[bytes, Optional[_Encoding]]:
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring that may or may not begin with a
           byte-order mark.

        :return: A 2-tuple (data stripped of byte-order mark, encoding implied by byte-order mark)
        """
        if isinstance(data, str):
            # Already Unicode: there is no byte-order mark to strip.
            return data, None

        # A UTF-16 mark only counts when at least four bytes are present
        # and the next two bytes are not both zero (which would make it
        # a UTF-32 little-endian mark instead).
        if len(data) >= 4 and data[2:4] != b"\x00\x00":
            if data[:2] == b"\xfe\xff":
                return data[2:], "utf-16be"
            if data[:2] == b"\xff\xfe":
                return data[2:], "utf-16le"

        if data[:3] == b"\xef\xbb\xbf":
            return data[3:], "utf-8"

        for mark, implied in (
            (b"\x00\x00\xfe\xff", "utf-32be"),
            (b"\xff\xfe\x00\x00", "utf-32le"),
        ):
            if data[:4] == mark:
                return data[4:], implied

        return data, None

    @classmethod
    def find_declared_encoding(
        cls,
        markup: Union[bytes, str],
        is_html: bool = False,
        search_entire_document: bool = False,
    ) -> Optional[_Encoding]:
        """Look inside a document for a declaration of its own encoding.

        An XML declaration is always looked for first. If none is found
        and ``is_html`` is true, a <meta> tag is looked for as well.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: By default only the start of the
            document is searched: the first 1024 characters for an XML
            declaration, and the first 2048 characters or 5% of the
            document (whichever is larger) for a <meta> tag. Set this
            to True to search the whole document instead.
        :return: The declared encoding, lowercased, if one is found.
        """
        if search_entire_document:
            xml_limit = html_limit = len(markup)
        else:
            xml_limit = 1024
            html_limit = max(2048, int(len(markup) * 0.05))

        # Use the pattern set compiled for the markup's own type.
        patterns = encoding_res[bytes] if isinstance(markup, bytes) else encoding_res[str]

        found = patterns["xml"].search(markup, endpos=xml_limit)
        if not found and is_html:
            found = patterns["html"].search(markup, endpos=html_limit)
        if found is None:
            return None

        declared = found.groups()[0]
        if not declared:
            return None
        if isinstance(declared, bytes):
            declared = declared.decode("ascii", "replace")
        return declared.lower()
|
|
|
|
|
|
|
|
class UnicodeDammit: |
|
|
"""A class for detecting the encoding of a bytestring containing an |
|
|
HTML or XML document, and decoding it to Unicode. If the source |
|
|
encoding is windows-1252, `UnicodeDammit` can also replace |
|
|
Microsoft smart quotes with their HTML or XML equivalents. |
|
|
|
|
|
:param markup: HTML or XML markup in an unknown encoding. |
|
|
|
|
|
:param known_definite_encodings: When determining the encoding |
|
|
of ``markup``, these encodings will be tried first, in |
|
|
order. In HTML terms, this corresponds to the "known |
|
|
definite encoding" step defined in `section 13.2.3.1 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding>`_. |
|
|
|
|
|
:param user_encodings: These encodings will be tried after the |
|
|
``known_definite_encodings`` have been tried and failed, and |
|
|
after an attempt to sniff the encoding by looking at a |
|
|
byte order mark has failed. In HTML terms, this |
|
|
corresponds to the step "user has explicitly instructed |
|
|
the user agent to override the document's character |
|
|
encoding", defined in `section 13.2.3.2 of the HTML standard <https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding>`_. |
|
|
|
|
|
:param override_encodings: A **deprecated** alias for |
|
|
``known_definite_encodings``. Any encodings here will be tried |
|
|
immediately after the encodings in |
|
|
``known_definite_encodings``. |
|
|
|
|
|
:param smart_quotes_to: By default, Microsoft smart quotes will, |
|
|
like all other characters, be converted to Unicode |
|
|
characters. Setting this to ``ascii`` will convert them to ASCII |
|
|
quotes instead. Setting it to ``xml`` will convert them to XML |
|
|
entity references, and setting it to ``html`` will convert them |
|
|
to HTML entity references. |
|
|
|
|
|
:param is_html: If True, ``markup`` is treated as an HTML |
|
|
document. Otherwise it's treated as an XML document. |
|
|
|
|
|
:param exclude_encodings: These encodings will not be considered, |
|
|
even if the sniffing code thinks they might make sense. |
|
|
|
|
|
""" |
|
|
|
|
|
def __init__(
    self,
    markup: bytes,
    known_definite_encodings: Optional[_Encodings] = None,
    smart_quotes_to: Optional[Literal["ascii", "xml", "html"]] = None,
    is_html: bool = False,
    exclude_encodings: Optional[_Encodings] = None,
    user_encodings: Optional[_Encodings] = None,
    override_encodings: Optional[_Encodings] = None,
):
    """Detect the encoding of ``markup`` and decode it to Unicode.

    On return, ``unicode_markup`` holds the decoded text (or None if
    nothing worked) and ``original_encoding`` names the codec that
    produced it (None if the input needed no decoding).

    :param markup: HTML or XML markup in an unknown encoding. A `str`
        or an empty bytestring is accepted and stored without decoding.
    :param known_definite_encodings: Encodings to try first, in order.
    :param smart_quotes_to: How to convert Microsoft smart quotes:
        ``"ascii"``, ``"xml"``, ``"html"``, or None to decode them like
        any other character.
    :param is_html: If True, ``markup`` is treated as an HTML document.
        Otherwise it's treated as an XML document.
    :param exclude_encodings: Encodings that must not be considered.
    :param user_encodings: Encodings to try after the known definite
        encodings and byte-order-mark sniffing have failed.
    :param override_encodings: A **deprecated** alias for
        ``known_definite_encodings``.
    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = getLogger(__name__)
    self.detector = EncodingDetector(
        markup,
        known_definite_encodings,
        is_html,
        exclude_encodings,
        user_encodings,
        override_encodings,
    )

    # Short-circuit if there is nothing to decode: the data is already
    # Unicode, or it is empty.
    if isinstance(markup, str) or markup == b"":
        self.markup = markup
        # str() on a bytes object returns its repr ("b''"), so the empty
        # bytestring is mapped to the empty string explicitly.
        self.unicode_markup = str(markup) if isinstance(markup, str) else ""
        self.original_encoding = None
        return

    # The detector has already stripped any byte-order mark; work with
    # the stripped data from here on.
    self.markup = self.detector.markup

    # First pass: try each candidate encoding strictly.
    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Compare against None rather than testing truthiness: an empty
    # string is a successful decode (e.g. markup that was only a
    # byte-order mark) and must not trigger the lossy second pass.
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.
        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER."
                )
                self.contains_replacement_characters = True
                break

    if u is None:
        # Nothing worked, even with replacement: give up.
        self.original_encoding = None
        self.unicode_markup = None
    else:
        self.unicode_markup = u
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The markup being decoded. For bytes input this is the data with any
# byte-order mark already stripped by the detector; a str or empty
# input is stored as given.
markup: bytes

# The decoded markup, or None if every decoding attempt failed.
unicode_markup: Optional[str]

# True if the markup could only be decoded using the "replace" error
# handler, i.e. by a pass that substitutes REPLACEMENT CHARACTER for
# undecodable bytes.
contains_replacement_characters: bool

# The name of the codec that successfully decoded the markup, or None
# if nothing was decoded.
original_encoding: Optional[_Encoding]

# The smart-quote conversion mode passed to the constructor
# ("ascii", "xml", "html" or None).
smart_quotes_to: Optional[str]

# Every (codec name, error handler) pair that has been attempted so
# far, in order.
tried_encodings: List[Tuple[_Encoding, str]]

# The logger used to warn about lossy decoding.
log: Logger
|
|
|
|
|
def _sub_ms_char(self, match: re.Match) -> bytes: |
|
|
"""Changes a MS smart quote character to an XML or HTML |
|
|
entity, or an ASCII character. |
|
|
|
|
|
TODO: Since this is only used to convert smart quotes, it |
|
|
could be simplified, and MS_CHARS_TO_ASCII made much less |
|
|
parochial. |
|
|
""" |
|
|
orig: bytes = match.group(1) |
|
|
sub: bytes |
|
|
if self.smart_quotes_to == "ascii": |
|
|
if orig in self.MS_CHARS_TO_ASCII: |
|
|
sub = self.MS_CHARS_TO_ASCII[orig].encode() |
|
|
else: |
|
|
|
|
|
|
|
|
sub = orig |
|
|
else: |
|
|
if orig in self.MS_CHARS: |
|
|
substitutions = self.MS_CHARS[orig] |
|
|
if type(substitutions) is tuple: |
|
|
if self.smart_quotes_to == "xml": |
|
|
sub = b"&#x" + substitutions[1].encode() + b";" |
|
|
else: |
|
|
sub = b"&" + substitutions[0].encode() + b";" |
|
|
else: |
|
|
substitutions = cast(str, substitutions) |
|
|
sub = substitutions.encode() |
|
|
else: |
|
|
|
|
|
|
|
|
sub = orig |
|
|
return sub |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps charset names to the encoding names that find_codec should look
# up in their place. find_codec consults this table before trying any
# other spelling of the name.
CHARSET_ALIASES: Dict[str, _Encoding] = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Codec names for which _convert_from rewrites bytes in the range
# 0x80-0x9f (via _sub_ms_char) before decoding, provided that
# smart_quotes_to is set.
ENCODINGS_WITH_SMART_QUOTES: _Encodings = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]
|
|
|
|
|
def _convert_from( |
|
|
self, proposed: _Encoding, errors: str = "strict" |
|
|
) -> Optional[str]: |
|
|
"""Attempt to convert the markup to the proposed encoding. |
|
|
|
|
|
:param proposed: The name of a character encoding. |
|
|
:param errors: An error handling strategy, used when calling `str`. |
|
|
:return: The converted markup, or `None` if the proposed |
|
|
encoding/error handling strategy didn't work. |
|
|
""" |
|
|
lookup_result = self.find_codec(proposed) |
|
|
if lookup_result is None or (lookup_result, errors) in self.tried_encodings: |
|
|
return None |
|
|
proposed = lookup_result |
|
|
self.tried_encodings.append((proposed, errors)) |
|
|
markup = self.markup |
|
|
|
|
|
|
|
|
if ( |
|
|
self.smart_quotes_to is not None |
|
|
and proposed in self.ENCODINGS_WITH_SMART_QUOTES |
|
|
): |
|
|
smart_quotes_re = b"([\x80-\x9f])" |
|
|
smart_quotes_compiled = re.compile(smart_quotes_re) |
|
|
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) |
|
|
|
|
|
try: |
|
|
|
|
|
|
|
|
u = self._to_unicode(markup, proposed, errors) |
|
|
self.unicode_markup = u |
|
|
self.original_encoding = proposed |
|
|
except Exception: |
|
|
|
|
|
|
|
|
return None |
|
|
|
|
|
return self.unicode_markup |
|
|
|
|
|
def _to_unicode( |
|
|
self, data: bytes, encoding: _Encoding, errors: str = "strict" |
|
|
) -> str: |
|
|
"""Given a bytestring and its encoding, decodes the string into Unicode. |
|
|
|
|
|
:param encoding: The name of an encoding. |
|
|
:param errors: An error handling strategy, used when calling `str`. |
|
|
""" |
|
|
return str(data, encoding, errors) |
|
|
|
|
|
@property
def declared_html_encoding(self) -> Optional[_Encoding]:
    """The encoding declared *inside* the document, as recorded by the
    detector, when the markup is being treated as HTML.

    :return: The detector's declared encoding, or None if the markup
        is not an HTML document.
    """
    return self.detector.declared_encoding if self.is_html else None
|
|
|
|
|
def find_codec(self, charset: _Encoding) -> Optional[str]:
    """Look up the Python codec corresponding to a given character set.

    The following spellings are tried in order, and the first one that
    ``_codec`` accepts is returned: the CHARSET_ALIASES replacement for
    the name (matched case-insensitively), the name with hyphens
    removed, and the name with hyphens turned into underscores. If none
    is accepted, the name itself is returned, so the result is not
    guaranteed to be a codec Python knows.

    :param charset: The name of a character set.
    :return: The lowercased name of a Python codec, or None if
        ``charset`` is empty or None.
    """
    if not charset:
        return None

    # Charset names are case-insensitive, but the alias table is keyed
    # by lowercase names, so lowercase the key before looking it up.
    aliased = self.CHARSET_ALIASES.get(charset.lower(), charset)

    for candidate in (
        aliased,
        charset.replace("-", ""),
        charset.replace("-", "_"),
    ):
        codec = self._codec(candidate)
        if codec:
            return codec.lower()

    # No spelling was recognized; hand back the name as given.
    return charset.lower()
|
|
|
|
|
def _codec(self, charset: _Encoding) -> Optional[str]:
    """Check whether Python has a codec registered under a given name.

    :param charset: The name of a character set.
    :return: ``charset`` itself when `codecs.lookup` accepts it; None
        when the lookup fails. An empty or None ``charset`` is
        returned unchanged without any lookup.
    """
    if not charset:
        return charset
    try:
        codecs.lookup(charset)
    except (LookupError, ValueError):
        # Unknown or malformed codec name.
        return None
    return charset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Replacements for the bytes 0x80-0x9F, which Windows-1252 uses for
# "smart" punctuation and a few extra letters.
#
# Each value is either a plain replacement string, or a pair of
# (entity name, hexadecimal Unicode code point). Where the name slot
# starts with "#" it already holds a numeric reference rather than a
# named entity. Bytes that Windows-1252 leaves undefined map to a
# plain " " or "?".
#
# The hexadecimal half of a pair must never be empty: a numeric
# character reference built from an empty code point ("&#x;") is
# malformed.
MS_CHARS: Dict[bytes, Union[str, Tuple[str, str]]] = {
    b"\x80": ("euro", "20AC"),  # €
    b"\x81": " ",  # undefined in Windows-1252
    b"\x82": ("sbquo", "201A"),  # ‚
    b"\x83": ("fnof", "192"),  # ƒ
    b"\x84": ("bdquo", "201E"),  # „
    b"\x85": ("hellip", "2026"),  # …
    b"\x86": ("dagger", "2020"),  # †
    b"\x87": ("Dagger", "2021"),  # ‡
    b"\x88": ("circ", "2C6"),  # ˆ
    b"\x89": ("permil", "2030"),  # ‰
    b"\x8a": ("Scaron", "160"),  # Š
    b"\x8b": ("lsaquo", "2039"),  # ‹
    b"\x8c": ("OElig", "152"),  # Œ
    b"\x8d": "?",  # undefined in Windows-1252
    b"\x8e": ("#x17D", "17D"),  # Ž
    b"\x8f": "?",  # undefined in Windows-1252
    b"\x90": "?",  # undefined in Windows-1252
    b"\x91": ("lsquo", "2018"),  # ‘
    b"\x92": ("rsquo", "2019"),  # ’
    b"\x93": ("ldquo", "201C"),  # “
    b"\x94": ("rdquo", "201D"),  # ”
    b"\x95": ("bull", "2022"),  # •
    b"\x96": ("ndash", "2013"),  # –
    b"\x97": ("mdash", "2014"),  # —
    b"\x98": ("tilde", "2DC"),  # ˜
    b"\x99": ("trade", "2122"),  # ™
    b"\x9a": ("scaron", "161"),  # š
    b"\x9b": ("rsaquo", "203A"),  # ›
    b"\x9c": ("oelig", "153"),  # œ
    b"\x9d": "?",  # undefined in Windows-1252
    b"\x9e": ("#x17E", "17E"),  # ž
    b"\x9f": ("Yuml", "178"),  # Ÿ (U+0178)
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Rough ASCII stand-ins for every byte from 0x80 through 0xFF, read
# as Windows-1252 / ISO-8859-1.
#
# The approximations are deliberately crude: accented letters lose
# their diacritics, typographic punctuation becomes its typewriter
# equivalent, and bytes with no assigned character become "?" or " ".
MS_CHARS_TO_ASCII: Dict[bytes, str] = {
    # 0x80-0x9F: the Windows-1252 additions.
    b"\x80": "EUR",  # €
    b"\x81": " ",  # undefined in Windows-1252
    b"\x82": ",",  # ‚
    b"\x83": "f",  # ƒ
    b"\x84": ",,",  # „
    b"\x85": "...",  # …
    b"\x86": "+",  # †
    b"\x87": "++",  # ‡
    b"\x88": "^",  # ˆ
    b"\x89": "%",  # ‰
    b"\x8a": "S",  # Š
    b"\x8b": "<",  # ‹
    b"\x8c": "OE",  # Œ
    b"\x8d": "?",  # undefined in Windows-1252
    b"\x8e": "Z",  # Ž
    b"\x8f": "?",  # undefined in Windows-1252
    b"\x90": "?",  # undefined in Windows-1252
    b"\x91": "'",  # ‘
    b"\x92": "'",  # ’
    b"\x93": '"',  # “
    b"\x94": '"',  # ”
    b"\x95": "*",  # •
    b"\x96": "-",  # –
    b"\x97": "--",  # —
    b"\x98": "~",  # ˜
    b"\x99": "(TM)",  # ™
    b"\x9a": "s",  # š
    b"\x9b": ">",  # ›
    b"\x9c": "oe",  # œ
    b"\x9d": "?",  # undefined in Windows-1252
    b"\x9e": "z",  # ž
    b"\x9f": "Y",  # Ÿ
    # 0xA0-0xBF: punctuation and symbols.
    b"\xa0": " ",  # non-breaking space
    b"\xa1": "!",  # ¡
    b"\xa2": "c",  # ¢
    b"\xa3": "GBP",  # £
    b"\xa4": "$",  # ¤ (generic currency sign; "$" is a loose stand-in)
    b"\xa5": "YEN",  # ¥
    b"\xa6": "|",  # ¦
    b"\xa7": "S",  # §
    b"\xa8": "..",  # ¨
    b"\xa9": "",  # ©
    b"\xaa": "(th)",  # ª
    b"\xab": "<<",  # «
    b"\xac": "!",  # ¬
    b"\xad": " ",  # soft hyphen
    b"\xae": "(R)",  # ®
    b"\xaf": "-",  # ¯
    b"\xb0": "o",  # °
    b"\xb1": "+-",  # ±
    b"\xb2": "2",  # ²
    b"\xb3": "3",  # ³
    b"\xb4": "'",  # ´
    b"\xb5": "u",  # µ
    b"\xb6": "P",  # ¶
    b"\xb7": "*",  # ·
    b"\xb8": ",",  # ¸
    b"\xb9": "1",  # ¹
    b"\xba": "(th)",  # º
    b"\xbb": ">>",  # »
    b"\xbc": "1/4",  # ¼
    b"\xbd": "1/2",  # ½
    b"\xbe": "3/4",  # ¾
    b"\xbf": "?",  # ¿
    # 0xC0-0xDF: uppercase letters (plus the multiplication sign).
    b"\xc0": "A",  # À
    b"\xc1": "A",  # Á
    b"\xc2": "A",  # Â
    b"\xc3": "A",  # Ã
    b"\xc4": "A",  # Ä
    b"\xc5": "A",  # Å
    b"\xc6": "AE",  # Æ
    b"\xc7": "C",  # Ç
    b"\xc8": "E",  # È
    b"\xc9": "E",  # É
    b"\xca": "E",  # Ê
    b"\xcb": "E",  # Ë
    b"\xcc": "I",  # Ì
    b"\xcd": "I",  # Í
    b"\xce": "I",  # Î
    b"\xcf": "I",  # Ï
    b"\xd0": "D",  # Ð
    b"\xd1": "N",  # Ñ
    b"\xd2": "O",  # Ò
    b"\xd3": "O",  # Ó
    b"\xd4": "O",  # Ô
    b"\xd5": "O",  # Õ
    b"\xd6": "O",  # Ö
    b"\xd7": "*",  # ×
    b"\xd8": "O",  # Ø
    b"\xd9": "U",  # Ù
    b"\xda": "U",  # Ú
    b"\xdb": "U",  # Û
    b"\xdc": "U",  # Ü
    b"\xdd": "Y",  # Ý
    b"\xde": "b",  # Þ
    b"\xdf": "B",  # ß
    # 0xE0-0xFF: lowercase letters (plus the division sign).
    b"\xe0": "a",  # à
    b"\xe1": "a",  # á
    b"\xe2": "a",  # â
    b"\xe3": "a",  # ã
    b"\xe4": "a",  # ä
    b"\xe5": "a",  # å
    b"\xe6": "ae",  # æ
    b"\xe7": "c",  # ç
    b"\xe8": "e",  # è
    b"\xe9": "e",  # é
    b"\xea": "e",  # ê
    b"\xeb": "e",  # ë
    b"\xec": "i",  # ì
    b"\xed": "i",  # í
    b"\xee": "i",  # î
    b"\xef": "i",  # ï
    b"\xf0": "o",  # ð
    b"\xf1": "n",  # ñ
    b"\xf2": "o",  # ò
    b"\xf3": "o",  # ó
    b"\xf4": "o",  # ô
    b"\xf5": "o",  # õ
    b"\xf6": "o",  # ö
    b"\xf7": "/",  # ÷
    b"\xf8": "o",  # ø
    b"\xf9": "u",  # ù
    b"\xfa": "u",  # ú
    b"\xfb": "u",  # û
    b"\xfc": "u",  # ü
    b"\xfd": "y",  # ý
    b"\xfe": "b",  # þ
    b"\xff": "y",  # ÿ
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps a single Windows-1252 byte value to the UTF-8 encoding of the
# same character. Byte values that Windows-1252 leaves undefined
# (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no entry.
#
# NOTE(review): there is no entry for 0xFF (ÿ) either; confirm whether
# that omission is intentional before adding one, since it would change
# what lookups on this table find.
WINDOWS_1252_TO_UTF8: Dict[int, bytes] = {
    0x80: b"\xe2\x82\xac",  # €
    0x82: b"\xe2\x80\x9a",  # ‚
    0x83: b"\xc6\x92",  # ƒ
    0x84: b"\xe2\x80\x9e",  # „
    0x85: b"\xe2\x80\xa6",  # …
    0x86: b"\xe2\x80\xa0",  # †
    0x87: b"\xe2\x80\xa1",  # ‡
    0x88: b"\xcb\x86",  # ˆ
    0x89: b"\xe2\x80\xb0",  # ‰
    0x8A: b"\xc5\xa0",  # Š
    0x8B: b"\xe2\x80\xb9",  # ‹
    0x8C: b"\xc5\x92",  # Œ
    0x8E: b"\xc5\xbd",  # Ž
    0x91: b"\xe2\x80\x98",  # ‘
    0x92: b"\xe2\x80\x99",  # ’
    0x93: b"\xe2\x80\x9c",  # “
    0x94: b"\xe2\x80\x9d",  # ”
    0x95: b"\xe2\x80\xa2",  # •
    0x96: b"\xe2\x80\x93",  # –
    0x97: b"\xe2\x80\x94",  # —
    0x98: b"\xcb\x9c",  # ˜
    0x99: b"\xe2\x84\xa2",  # ™
    0x9A: b"\xc5\xa1",  # š
    0x9B: b"\xe2\x80\xba",  # ›
    0x9C: b"\xc5\x93",  # œ
    0x9E: b"\xc5\xbe",  # ž
    0x9F: b"\xc5\xb8",  # Ÿ
    0xA0: b"\xc2\xa0",  # non-breaking space
    0xA1: b"\xc2\xa1",  # ¡
    0xA2: b"\xc2\xa2",  # ¢
    0xA3: b"\xc2\xa3",  # £
    0xA4: b"\xc2\xa4",  # ¤
    0xA5: b"\xc2\xa5",  # ¥
    0xA6: b"\xc2\xa6",  # ¦
    0xA7: b"\xc2\xa7",  # §
    0xA8: b"\xc2\xa8",  # ¨
    0xA9: b"\xc2\xa9",  # ©
    0xAA: b"\xc2\xaa",  # ª
    0xAB: b"\xc2\xab",  # «
    0xAC: b"\xc2\xac",  # ¬
    0xAD: b"\xc2\xad",  # soft hyphen
    0xAE: b"\xc2\xae",  # ®
    0xAF: b"\xc2\xaf",  # ¯
    0xB0: b"\xc2\xb0",  # °
    0xB1: b"\xc2\xb1",  # ±
    0xB2: b"\xc2\xb2",  # ²
    0xB3: b"\xc2\xb3",  # ³
    0xB4: b"\xc2\xb4",  # ´
    0xB5: b"\xc2\xb5",  # µ
    0xB6: b"\xc2\xb6",  # ¶
    0xB7: b"\xc2\xb7",  # ·
    0xB8: b"\xc2\xb8",  # ¸
    0xB9: b"\xc2\xb9",  # ¹
    0xBA: b"\xc2\xba",  # º
    0xBB: b"\xc2\xbb",  # »
    0xBC: b"\xc2\xbc",  # ¼
    0xBD: b"\xc2\xbd",  # ½
    0xBE: b"\xc2\xbe",  # ¾
    0xBF: b"\xc2\xbf",  # ¿
    0xC0: b"\xc3\x80",  # À
    0xC1: b"\xc3\x81",  # Á
    0xC2: b"\xc3\x82",  # Â
    0xC3: b"\xc3\x83",  # Ã
    0xC4: b"\xc3\x84",  # Ä
    0xC5: b"\xc3\x85",  # Å
    0xC6: b"\xc3\x86",  # Æ
    0xC7: b"\xc3\x87",  # Ç
    0xC8: b"\xc3\x88",  # È
    0xC9: b"\xc3\x89",  # É
    0xCA: b"\xc3\x8a",  # Ê
    0xCB: b"\xc3\x8b",  # Ë
    0xCC: b"\xc3\x8c",  # Ì
    0xCD: b"\xc3\x8d",  # Í
    0xCE: b"\xc3\x8e",  # Î
    0xCF: b"\xc3\x8f",  # Ï
    0xD0: b"\xc3\x90",  # Ð
    0xD1: b"\xc3\x91",  # Ñ
    0xD2: b"\xc3\x92",  # Ò
    0xD3: b"\xc3\x93",  # Ó
    0xD4: b"\xc3\x94",  # Ô
    0xD5: b"\xc3\x95",  # Õ
    0xD6: b"\xc3\x96",  # Ö
    0xD7: b"\xc3\x97",  # ×
    0xD8: b"\xc3\x98",  # Ø
    0xD9: b"\xc3\x99",  # Ù
    0xDA: b"\xc3\x9a",  # Ú
    0xDB: b"\xc3\x9b",  # Û
    0xDC: b"\xc3\x9c",  # Ü
    0xDD: b"\xc3\x9d",  # Ý
    0xDE: b"\xc3\x9e",  # Þ
    0xDF: b"\xc3\x9f",  # ß
    0xE0: b"\xc3\xa0",  # à
    0xE1: b"\xc3\xa1",  # á
    0xE2: b"\xc3\xa2",  # â
    0xE3: b"\xc3\xa3",  # ã
    0xE4: b"\xc3\xa4",  # ä
    0xE5: b"\xc3\xa5",  # å
    0xE6: b"\xc3\xa6",  # æ
    0xE7: b"\xc3\xa7",  # ç
    0xE8: b"\xc3\xa8",  # è
    0xE9: b"\xc3\xa9",  # é
    0xEA: b"\xc3\xaa",  # ê
    0xEB: b"\xc3\xab",  # ë
    0xEC: b"\xc3\xac",  # ì
    0xED: b"\xc3\xad",  # í
    0xEE: b"\xc3\xae",  # î
    0xEF: b"\xc3\xaf",  # ï
    0xF0: b"\xc3\xb0",  # ð
    0xF1: b"\xc3\xb1",  # ñ
    0xF2: b"\xc3\xb2",  # ò
    0xF3: b"\xc3\xb3",  # ó
    0xF4: b"\xc3\xb4",  # ô
    0xF5: b"\xc3\xb5",  # õ
    0xF6: b"\xc3\xb6",  # ö
    0xF7: b"\xc3\xb7",  # ÷
    0xF8: b"\xc3\xb8",  # ø
    0xF9: b"\xc3\xb9",  # ù
    0xFA: b"\xc3\xba",  # ú
    0xFB: b"\xc3\xbb",  # û
    0xFC: b"\xc3\xbc",  # ü
    0xFD: b"\xc3\xbd",  # ý
    0xFE: b"\xc3\xbe",  # þ
}
|
|
|
|
|
|
|
|
# Ranges of byte values that open a multibyte UTF-8 sequence, as
# (lowest lead byte, highest lead byte, total sequence length in bytes).
# Listed in ascending order; the two constants below rely on that.
MULTIBYTE_MARKERS_AND_SIZES: List[Tuple[int, int, int]] = [
    (0xC2, 0xDF, 2),  # two-byte sequences
    (0xE0, 0xEF, 3),  # three-byte sequences
    (0xF0, 0xF4, 4),  # four-byte sequences
]

# The lowest byte value that can open a multibyte sequence.
FIRST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[0][0]

# The highest byte value that can open a multibyte sequence.
LAST_MULTIBYTE_MARKER: int = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
|
|
|
|
@classmethod
def detwingle(
    cls,
    in_bytes: bytes,
    main_encoding: _Encoding = "utf8",
    embedded_encoding: _Encoding = "windows-1252",
) -> bytes:
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    Any byte in a UTF-8 lead-byte range is trusted to start a
    multibyte sequence of the corresponding length and is skipped
    along with the rest of that sequence; the continuation bytes are
    not validated. Any other byte of 0x80 or above that has an entry
    in ``WINDOWS_1252_TO_UTF8`` is replaced with its UTF-8 encoding.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this *must*
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of ``in_bytes``.
        Only UTF-8 is supported. The name is matched
        case-insensitively and "_" is treated like "-".
    :param embedded_encoding: The encoding that was used to embed
        characters in the main document. Only Windows-1252 and
        ISO-8859-1 are supported. The name is matched
        case-insensitively and "_" is treated like "-".
    :return: A bytestring similar to ``in_bytes``, in which
        ``embedded_encoding`` characters have been converted to
        their ``main_encoding`` equivalents. When nothing needed
        converting, ``in_bytes`` itself is returned.
    :raise NotImplementedError: If either encoding is unsupported.
    """
    # Normalize once so that spelling variants such as "Windows_1252"
    # and "UTF_8" compare equal to their canonical names.
    embedded_name = embedded_encoding.replace("_", "-").lower()
    if embedded_name not in ("windows-1252", "iso-8859-1"):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings."
        )

    main_name = main_encoding.replace("_", "-").lower()
    if main_name not in ("utf8", "utf-8"):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding."
        )

    byte_chunks: List[bytes] = []

    # Start of the stretch of input not yet copied into byte_chunks.
    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            # This is the start of a UTF-8 multibyte character. Skip
            # to the end of it.
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    pos += size
                    break
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # This is an embedded Windows-1252 character. Save the
            # input up to this point as a chunk...
            byte_chunks.append(in_bytes[chunk_start:pos])

            # ...then add the character's UTF-8 translation as a
            # chunk of its own.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # An ordinary byte; go on to the next one.
            pos += 1

    if chunk_start == 0:
        # Nothing was replaced, so the input is returned unchanged.
        return in_bytes

    # Store whatever follows the last replacement as the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b"".join(byte_chunks)
|
|
|