from __future__ import annotations

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

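    # Probe bytes 0x40-0xFE only: everything below 0x40 is control characters,
    # digits and punctuation that look alike in every ASCII-compatible code
    # page, so only the letter-bearing upper region discriminates encodings.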
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: str | None = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
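
# Illustrative sketch, not part of the module API: the exact output depends
# on the codecs shipped with the running interpreter, but a Cyrillic code
# page would be expected to yield something like:
#   encoding_unicode_range("cp1251")  # -> ["Basic Latin", "Cyrillic"]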


def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return the languages inferred to be used with the given Unicode range.
    """
    languages: list[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
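
# Illustrative sketch: assuming FREQUENCIES includes Cyrillic-script entries
# such as "Russian" and "Bulgarian", one would expect:
#   unicode_range_languages("Cyrillic")  # -> ["Russian", "Bulgarian", ...]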


@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    tied to particular languages; this function maps that correspondence.
    """
    unicode_ranges: list[str] = encoding_unicode_range(iana_name)
    primary_range: str | None = None

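    # Nearly every single-byte code page covers ASCII Latin, so Latin ranges
    # carry no signal here; the first non-Latin range is what identifies the
    # code page's target script.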
    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)


@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    tied to particular languages; this function maps that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
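
# Illustrative sketch, derived directly from the prefix checks above:
#   mb_encoding_languages("shift_jis")  # -> ["Japanese"]
#   mb_encoding_languages("gb18030")    # -> ["Chinese"]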


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine the main traits of a supported language: whether it uses
    accentuated characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
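
# Illustrative sketch (assuming the usual frequency tables):
#   get_target_features("French")   # -> (True, True): accents, pure Latin
#   get_target_features("Russian")  # -> (False, False): Cyrillic script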


def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: list[tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

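    # Cheap pre-filters: skip non-Latin languages when asked to, and skip
    # accent-free languages when the source clearly uses accents.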
    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]


def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine how well an ordered character list (from most frequent to
    rarest) matches a particular language. The result is a ratio between
    0.0 (no correspondence) and 1.0 (near-perfect fit).
    Beware that this function is deliberately not strict about the match,
    in order to ease detection: a close match already scores 1.0.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available")

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

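    # For each observed character, project its rank onto the scale of the
    # language's frequency table and compare it with the character's actual
    # rank in that table; a small distance counts as a match.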
    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

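        # Rank distance alone was inconclusive; fall back to comparing the
        # sets of characters that appear before and after this one in both
        # orderings. Enough overlap on either side still counts as a match.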
        characters_before_source: list[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: list[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: list[str] = ordered_characters[0:character_rank]
        characters_after: list[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)


def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str split by Unicode
    range (alphabet). E.g. a text mixing English/Latin with a bit of Hebrew
    will return two items: one holding the Latin letters, one the Hebrew.
    """
    layers: dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: str | None = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: str | None = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
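
# Illustrative sketch: assuming Latin and Cyrillic are treated as distinct
# layers, a mixed input is expected to split roughly as:
#   alpha_unicode_split("Hello мир")  # -> ["hello", "мир"]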


def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously produced by the coherence_ratio function.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: dict[str, list[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
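
# Illustrative sketch: ratios for the same language are averaged, then the
# merged list is sorted by ratio, e.g.:
#   merge_coherence_ratios([[("English", 0.9)], [("English", 0.7)]])
#   # -> [("English", 0.8)]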


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    removes the em-dash from its name.
    """
    index_results: dict[str, list[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
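
# Illustrative sketch: an em-dash variant collapses into its base language,
# keeping the best of the two ratios:
#   filter_alt_coherence_matches([("English", 0.5), ("English—", 0.7)])
#   # -> [("English", 0.7)]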


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed layer by layer; a layer is the set of characters
    extracted for one alphabet/Unicode range.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

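    # Each layer (one alphabet) is scored independently: rank its characters
    # by frequency, then compare that ordering against each candidate
    # language's reference frequency table.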
    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: list[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )