Spaces:

dishitanagi
/

test3

Sleeping

App Files Files Community

dishitanagi committed on Dec 10, 2025

Commit

7bb797c

verified ·

1 Parent(s): 030be37

Upload homoglyphs.py

Browse files

Files changed (1) hide show

homoglyphs.py +265 -0

homoglyphs.py ADDED Viewed

	@@ -0,0 +1,265 @@

+"""Updated version of core.py from
+https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork
+for modern python3
+"""
+from collections import defaultdict
+import json
+from itertools import product
+import os
+import unicodedata
+# Strategies applied when a character is not in the configured alphabet.
+STRATEGY_LOAD = 1  # detect the char's languages/category and load that alphabet
+STRATEGY_IGNORE = 2  # pass the char through to the result unchanged
+STRATEGY_REMOVE = 3  # drop the char from the result
+ASCII_RANGE = range(128)  # code points 0-127; default ascii_range of Homoglyphs
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this module
+DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")  # directory the JSON data files are read from
+class Categories:
+    """
+    Work with aliases from ISO 15924.
+    https://en.wikipedia.org/wiki/ISO_15924#List_of_codes
+    """
+    fpath = os.path.join(DATA_LOCATION, "categories.json")
+    @classmethod
+    def _get_ranges(cls, categories):
+        """
+        :return: iter: (start code, end code)
+        :rtype: list
+        """
+        with open(cls.fpath, encoding="utf-8") as f:
+            data = json.load(f)
+        for category in categories:
+            if category not in data["aliases"]:
+                raise ValueError("Invalid category: {}".format(category))
+        for point in data["points"]:
+            if point[2] in categories:
+                yield point[:2]
+    @classmethod
+    def get_alphabet(cls, categories):
+        """
+        :return: set of chars in alphabet by categories list
+        :rtype: set
+        """
+        alphabet = set()
+        for start, end in cls._get_ranges(categories):
+            chars = (chr(code) for code in range(start, end + 1))
+            alphabet.update(chars)
+        return alphabet
+    @classmethod
+    def detect(cls, char):
+        """
+        :return: category
+        :rtype: str
+        """
+        with open(cls.fpath, encoding="utf-8") as f:
+            data = json.load(f)
+        # try detect category by unicodedata
+        try:
+            category = unicodedata.name(char).split()[0]
+        except (TypeError, ValueError):
+            # In Python2 unicodedata.name raise error for non-unicode chars
+            # Python3 raise ValueError for non-unicode characters
+            pass
+        else:
+            if category in data["aliases"]:
+                return category
+        # try detect category by ranges from JSON file.
+        code = ord(char)
+        for point in data["points"]:
+            if point[0] <= code <= point[1]:
+                return point[2]
+    @classmethod
+    def get_all(cls):
+        with open(cls.fpath, encoding="utf-8") as f:
+            data = json.load(f)
+        return set(data["aliases"])
+class Languages:
+    fpath = os.path.join(DATA_LOCATION, "languages.json")
+    @classmethod
+    def get_alphabet(cls, languages):
+        """
+        :return: set of chars in alphabet by languages list
+        :rtype: set
+        """
+        with open(cls.fpath, encoding="utf-8") as f:
+            data = json.load(f)
+        alphabet = set()
+        for lang in languages:
+            if lang not in data:
+                raise ValueError("Invalid language code: {}".format(lang))
+            alphabet.update(data[lang])
+        return alphabet
+    @classmethod
+    def detect(cls, char):
+        """
+        :return: set of languages which alphabet contains passed char.
+        :rtype: set
+        """
+        with open(cls.fpath, encoding="utf-8") as f:
+            data = json.load(f)
+        languages = set()
+        for lang, alphabet in data.items():
+            if char in alphabet:
+                languages.add(lang)
+        return languages
+    @classmethod
+    def get_all(cls):
+        with open(cls.fpath, encoding="utf-8") as f:
+            data = json.load(f)
+        return set(data.keys())
+class Homoglyphs:
+    def __init__(
+        self,
+        categories=None,
+        languages=None,
+        alphabet=None,
+        strategy=STRATEGY_IGNORE,
+        ascii_strategy=STRATEGY_IGNORE,
+        ascii_range=ASCII_RANGE,
+    ):
+        # strategies
+        if strategy not in (STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE):
+            raise ValueError("Invalid strategy")
+        self.strategy = strategy
+        self.ascii_strategy = ascii_strategy
+        self.ascii_range = ascii_range
+        # Homoglyphs must be initialized by any alphabet for correct work
+        if not categories and not languages and not alphabet:
+            categories = ("LATIN", "COMMON")
+        # cats and langs
+        self.categories = set(categories or [])
+        self.languages = set(languages or [])
+        # alphabet
+        self.alphabet = set(alphabet or [])
+        if self.categories:
+            alphabet = Categories.get_alphabet(self.categories)
+            self.alphabet.update(alphabet)
+        if self.languages:
+            alphabet = Languages.get_alphabet(self.languages)
+            self.alphabet.update(alphabet)
+        self.table = self.get_table(self.alphabet)
+    @staticmethod
+    def get_table(alphabet):
+        table = defaultdict(set)
+        with open(os.path.join(DATA_LOCATION, "confusables_sept2022.json")) as f:
+            data = json.load(f)
+        for char in alphabet:
+            if char in data:
+                for homoglyph in data[char]:
+                    if homoglyph in alphabet:
+                        table[char].add(homoglyph)
+        return table
+    @staticmethod
+    def get_restricted_table(source_alphabet, target_alphabet):
+        table = defaultdict(set)
+        with open(os.path.join(DATA_LOCATION, "confusables_sept2022.json")) as f:
+            data = json.load(f)
+        for char in source_alphabet:
+            if char in data:
+                for homoglyph in data[char]:
+                    if homoglyph in target_alphabet:
+                        table[char].add(homoglyph)
+        return table
+    @staticmethod
+    def uniq_and_sort(data):
+        result = list(set(data))
+        result.sort(key=lambda x: (-len(x), x))
+        return result
+    def _update_alphabet(self, char):
+        # try detect languages
+        langs = Languages.detect(char)
+        if langs:
+            self.languages.update(langs)
+            alphabet = Languages.get_alphabet(langs)
+            self.alphabet.update(alphabet)
+        else:
+            # try detect categories
+            category = Categories.detect(char)
+            if category is None:
+                return False
+            self.categories.add(category)
+            alphabet = Categories.get_alphabet([category])
+            self.alphabet.update(alphabet)
+        # update table for new alphabet
+        self.table = self.get_table(self.alphabet)
+        return True
+    def _get_char_variants(self, char):
+        if char not in self.alphabet:
+            if self.strategy == STRATEGY_LOAD:
+                if not self._update_alphabet(char):
+                    return []
+            elif self.strategy == STRATEGY_IGNORE:
+                return [char]
+            elif self.strategy == STRATEGY_REMOVE:
+                return []
+        # find alternative chars for current char
+        alt_chars = self.table.get(char, set())
+        if alt_chars:
+            # find alternative chars for alternative chars for current char
+            alt_chars2 = [self.table.get(alt_char, set()) for alt_char in alt_chars]
+            # combine all alternatives
+            alt_chars.update(*alt_chars2)
+        # add current char to alternatives
+        alt_chars.add(char)
+        # uniq, sort and return
+        return self.uniq_and_sort(alt_chars)
+    def _get_combinations(self, text, ascii=False):
+        variations = []
+        for char in text:
+            alt_chars = self._get_char_variants(char)
+            if ascii:
+                alt_chars = [char for char in alt_chars if ord(char) in self.ascii_range]
+                if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
+                    return
+            if alt_chars:
+                variations.append(alt_chars)
+        if variations:
+            for variant in product(*variations):
+                yield "".join(variant)
+    def get_combinations(self, text):
+        return list(self._get_combinations(text))
+    def _to_ascii(self, text):
+        for variant in self._get_combinations(text, ascii=True):
+            if max(map(ord, variant)) in self.ascii_range:
+                yield variant
+    def to_ascii(self, text):
+        return self.uniq_and_sort(self._to_ascii(text))