| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import logging |
| import re |
|
|
| from .enums import ProbingState |
|
|
# One "international word": optional ASCII letters, one or more high bytes
# (0x80-0xFF), optional ASCII letters, then at most one trailing marker byte
# (any byte that is neither an ASCII letter nor a high byte).
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
)
|
|
|
|
class CharSetProber:
    """Common interface for charset probers plus shared byte filters.

    ``feed`` raises ``NotImplementedError`` here and must be supplied by a
    subclass; ``charset_name`` and ``get_confidence`` return placeholder
    values (``None`` and ``0.0``) in this class.
    """

    # Not referenced inside this class; presumably a confidence cut-off used
    # by callers -- confirm against the code that reads it.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        """Remember ``lang_filter`` and create a logger named after the module.

        The probing state is ``None`` until ``reset`` is called.
        """
        self.logger = logging.getLogger(__name__)
        self.lang_filter = lang_filter
        self._state = None

    def reset(self):
        """Put the prober back into the ``ProbingState.DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; always ``None`` in this class."""
        return None

    def feed(self, byte_str):
        """Consume ``byte_str``; not implemented in this class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    @property
    def state(self):
        """Current probing state (``None`` before the first ``reset``)."""
        return self._state

    def get_confidence(self):
        """Return the detection confidence; always ``0.0`` in this class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """Return ``buf`` with each run of ASCII bytes (0x00-0x7F) replaced
        by a single space, leaving the high bytes untouched.
        """
        return re.sub(b"([\x00-\x7F])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf):
        """Keep only the words of ``buf`` that contain a high byte.

        Bytes fall into three classes:
            alphabet: ASCII letters [a-zA-Z]
            international: high bytes 0x80-0xFF
            marker: every other byte
        Each match of ``INTERNATIONAL_WORDS_PATTERN`` (a word holding at
        least one international byte, optionally followed by one marker) is
        copied to the output; a trailing marker is written as a single ASCII
        space. Everything the pattern does not match is dropped.

        Returns:
            A ``bytearray`` with the filtered bytes.
        """
        filtered = bytearray()

        for word in INTERNATIONAL_WORDS_PATTERN.findall(buf):
            head, tail = word[:-1], word[-1:]
            # A final byte that is neither a high byte nor an ASCII letter is
            # a marker; normalise it to a space so it only acts as a
            # delimiter in the output.
            if tail < b"\x80" and not tail.isalpha():
                tail = b" "
            filtered.extend(head)
            filtered.extend(tail)

        return filtered

    @staticmethod
    def remove_xml_tags(buf):
        """Return a ``bytearray`` copy of ``buf`` without ``<...>`` spans.

        Each non-empty stretch that ends at a ``<`` is copied and followed by
        one space. Every ``>`` restarts the current stretch just after it, so
        bytes preceding a ``>`` that has no opening ``<`` are not copied. If
        the input ends outside a tag, the final stretch is copied with no
        trailing space; if it ends inside an unclosed tag, the remainder is
        dropped. The original documentation notes this filter is used by
        ``Latin1Prober``.
        """
        kept = bytearray()
        # View the input as single-byte elements so each item compares
        # against one-byte ``bytes`` literals.
        view = memoryview(buf).cast("c")
        inside_tag = False
        start = 0

        for pos, char in enumerate(view):
            if char == b">":
                # Leaving a tag: the next kept stretch begins after it.
                start = pos + 1
                inside_tag = False
                continue
            if char != b"<":
                continue
            # Entering a tag: flush the stretch collected so far, if any,
            # and delimit it with a space.
            if pos > start and not inside_tag:
                kept.extend(view[start:pos])
                kept.extend(b" ")
            inside_tag = True

        # Input ended outside a tag: keep whatever follows the last ``>``.
        if not inside_tag:
            kept.extend(view[start:])

        return kept
|
|