| | from __future__ import annotations |
| |
|
| | from typing import TYPE_CHECKING, Any |
| | from warnings import warn |
| |
|
| | from .api import from_bytes |
| | from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE |
| |
|
| | |
if TYPE_CHECKING:
    # This branch is only seen by static type checkers: TYPE_CHECKING is False
    # at runtime, so neither the import nor the class below exists when the
    # module is executed. Annotations that mention ResultDict stay valid
    # because of `from __future__ import annotations` at the top of the file.
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        """Static description of the mapping returned by the legacy ``detect()``."""

        # Name of the detected encoding, or None.
        encoding: str | None
        # Detected language as a string.
        language: str
        # Confidence score for the detection, or None.
        confidence: float | None
| |
|
| |
|
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings
        to their more modern equivalents? When this is exactly ``False``
        (the default), an encoding name found in ``CHARDET_CORRESPONDENCE``
        is replaced by its mapped value; otherwise the name is returned as
        computed.
    :param kwargs: Accepted for signature compatibility only. Any keyword
        argument given here is ignored and reported through a warning.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and
        ``"confidence"``. When no match is found, ``encoding`` and
        ``confidence`` are ``None`` and ``language`` is ``""``.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        # stacklevel=2 attributes the warning to the caller that passed the
        # unsupported arguments instead of to this module.
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    # "Unknown" is reported to legacy callers as an empty string.
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # Lower the confidence on small samples: a high score for something other
    # than utf_8/ascii, without a BOM, on fewer than TOO_SMALL_SEQUENCE bytes
    # gets a fixed 0.2 penalty. `confidence is not None` implies `r is not None`.
    if (
        confidence is not None
        and confidence >= 0.9
        and encoding
        not in {
            "utf_8",
            "ascii",
        }
        and r.bom is False
        and len(byte_str) < TOO_SMALL_SEQUENCE
    ):
        confidence -= 0.2

    # A utf_8 match that carries a BOM is reported with the "_sig" suffix.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    # Identity test kept on purpose: only the literal False triggers the
    # CHARDET_CORRESPONDENCE mapping, exactly as before.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
| |
|