| | from __future__ import annotations |
| |
|
| | from encodings.aliases import aliases |
| | from hashlib import sha256 |
| | from json import dumps |
| | from re import sub |
| | from typing import Any, Iterator, List, Tuple |
| |
|
| | from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE |
| | from .utils import iana_name, is_multi_byte_encoding, unicode_range |
| |
|
| |
|
class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    An instance keeps the original bytes, the guessed encoding, the mess
    ("chaos") ratio, the language coherence results and, lazily, the decoded
    text. Other matches may be attached to it as submatches.

    NOTE: the class defines __eq__ without __hash__, so instances are not hashable.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        """
        Store the detection data; nothing is decoded or re-encoded here.

        payload is the original byte sequence and guessed_encoding the codec name
        used to decode it. mean_mess_ratio is exposed as 'chaos', has_sig_or_bom
        as 'bom' / 'byte_order_mark'. languages is a list of (language, coherence)
        tuples whose first entry is reported as the most probable one.
        decoded_payload, when given, pre-fills the decoded text cache.
        preemptive_declaration, when given, is the encoding name that output()
        rewrites at the head of the decoded text.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Filled on first access of the 'alphabets' property.
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache of the last output() call and the encoding it was made for.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text; built on demand by __str__ when left to None.
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Compare against another CharsetMatch or against an encoding name.

        Two matches are equal when both their encoding and their fingerprint are
        equal. A str is normalized with iana_name() and compared to this match's
        encoding. Any other type is never equal.
        """
        if isinstance(other, CharsetMatch):
            return (
                self.encoding == other.encoding
                and self.fingerprint == other.fingerprint
            )
        if isinstance(other, str):
            return iana_name(other) == self.encoding
        return False

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        Raise ValueError when other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Chaos is nearly identical (below 0.01): let coherence decide when it
        # differs enough, the higher coherence sorting first.
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # Both ratios are too close to call. Prefer the match that consumed
            # the most multi-byte sequences, except for payloads at or above
            # TOO_BIG_SEQUENCE: multi_byte_usage needs the fully decoded text,
            # so chaos is used there instead.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Share of the payload absorbed by multi-byte sequences, computed as
        1 - (decoded character count / raw byte count).
        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the decoded text, decoding the payload strictly on first use.
        """
        # Lazy str loading: the text may be absent at construction time or may
        # have been dropped by add_submatch().
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Attach another match as a submatch of this one.
        Raise ValueError when other is not a CharsetMatch or compares equal to self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        # Drop the submatch's cached text; __str__ can rebuild it on demand.
        other._string = None
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """
        Name of the guessed encoding, as given at construction.
        """
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        # The alias table maps alias -> codec name; collect the other side of
        # every pair in which this encoding appears.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """
        Value of the has_sig_or_bom flag given at construction.
        """
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """
        Same value as the 'bom' property.
        """
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Nothing was detected: fall back on what the encoding itself
            # suggests. A payload that could be ASCII is reported as English.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # NOTE(review): imported here rather than at module level,
            # presumably to avoid a circular import - confirm before moving.
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """
        Mean mess ratio given at construction.
        """
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Coherence ratio of the first language entry, or 0.0 when there is none.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """
        Chaos expressed as a percentage, rounded to three decimals.
        """
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """
        Coherence expressed as a percentage, rounded to three decimals.
        """
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        """
        Matches attached through add_submatch(), in insertion order.
        """
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """
        True when at least one submatch has been attached.
        """
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """
        Sorted, de-duplicated list of the unicode range names found in the decoded text.
        Computed once, then served from cache.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # unicode_range() may return None for a character; those are skipped.
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that the target encoding cannot represent are REPLACED by the encoder
        ("replace" error handler), they are not silently dropped.

        The result is cached and reused for as long as the same target encoding is requested.
        When a preemptive declaration was given and is not a spelling of UTF-8, the first
        encoding indication found in the first 8192 characters is rewritten to name the
        target encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Swap the declared charset (first captured group) for the
                # target one, written with hyphens instead of underscores.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        Relies on output() with its default target encoding, which also refreshes the output cache.
        """
        return sha256(self.output()).hexdigest()
| |
|
| |
|
class CharsetMatches:
    """
    Sorted collection of CharsetMatch items, the most probable one first.
    Offers a list-like subset of operations: iteration, indexing, len() and truthiness.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        """
        Start from the given matches (stored sorted) or from an empty collection.
        """
        if results:
            self._results: list[CharsetMatch] = sorted(results)
        else:
            self._results = []

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Fetch one match by position or by encoding name.

        An int is used as a plain list index, so an out-of-range position raises
        IndexError. A str is normalized with iana_name() and the first match
        listing it in could_be_from_charset is returned. KeyError is raised when
        no match carries that encoding, or when item is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if not isinstance(item, str):
            raise KeyError

        wanted = iana_name(item, False)
        for candidate in self._results:
            if wanted in candidate.could_be_from_charset:
                return candidate
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Add one match while keeping the collection sorted.

        When the item's raw payload is shorter than TOO_BIG_SEQUENCE and a stored
        match has the same fingerprint and the same chaos, the item is attached to
        the first such match as a submatch instead of being stored on its own.
        Raise ValueError when item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )

        # Duplicate folding is skipped for payloads at or above TOO_BIG_SEQUENCE.
        if len(item.raw) < TOO_BIG_SEQUENCE:
            twin = next(
                (
                    known
                    for known in self._results
                    if known.fingerprint == item.fingerprint
                    and known.chaos == item.chaos
                ),
                None,
            )
            if twin is not None:
                twin.add_submatch(item)
                return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Return the first (most probable) match, or None when the collection is empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Alias of best(), kept for backward compatibility.
        """
        return self.best()
| |
|
| |
|
# One language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]
# List of language guesses; CharsetMatch reports the first entry as the most probable.
CoherenceMatches = List[CoherenceMatch]
| |
|
| |
|
class CliDetectionResult:
    """
    Flat record describing one detection outcome, serializable to JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        """
        Store every argument, unchanged, on the attribute of the same name.
        """
        self.path: str = path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: str | None = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:
        """
        Fields of the record as a plain dictionary, in a fixed key order.
        """
        # The tuple order is the key order of the resulting dictionary, and
        # therefore of the JSON document built by to_json().
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the record as ASCII-only JSON indented with four spaces.
        """
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)
| |
|