dishitanagi committed on
Commit
7bb797c
·
verified ·
1 Parent(s): 030be37

Upload homoglyphs.py

Browse files
Files changed (1) hide show
  1. homoglyphs.py +265 -0
homoglyphs.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Updated version of core.py from
2
+ https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork
3
+ for modern python3
4
+ """
5
+
6
+ from collections import defaultdict
7
+ import json
8
+ from itertools import product
9
+ import os
10
+ import unicodedata
11
+
12
# What to do with a character that is not part of the current alphabet.
# Load the languages/category of the character, then look it up again.
STRATEGY_LOAD = 1
# Keep the character in the result as-is.
STRATEGY_IGNORE = 2
# Drop the character from the result.
STRATEGY_REMOVE = 3

# Code points accepted as ASCII by default.
ASCII_RANGE = range(128)


# The JSON data files are looked up in a directory next to this module.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")
22
+
23
+
24
class Categories:
    """
    Work with aliases from ISO 15924.
    https://en.wikipedia.org/wiki/ISO_15924#List_of_codes

    All data comes from ``categories.json``, which is re-read on every call.
    The code reads two keys from it: ``"aliases"`` (the known category names)
    and ``"points"`` (entries indexed as ``[start code, end code, alias]``).
    """

    fpath = os.path.join(DATA_LOCATION, "categories.json")

    @classmethod
    def _get_ranges(cls, categories):
        """
        Lazily yield the code point ranges that belong to ``categories``.

        This is a generator: the file is read and the names are validated
        only once iteration starts.

        :param categories: category aliases; iterated once for validation and
            then used for membership tests
        :return: iter: (start code, end code)
        :raises ValueError: if a category is not a known alias
        """
        with open(cls.fpath, encoding="utf-8") as handle:
            payload = json.load(handle)

        known_aliases = payload["aliases"]
        for name in categories:
            if name not in known_aliases:
                raise ValueError("Invalid category: {}".format(name))

        yield from (
            entry[:2] for entry in payload["points"] if entry[2] in categories
        )

    @classmethod
    def get_alphabet(cls, categories):
        """
        :return: set of chars in alphabet by categories list
        :rtype: set
        :raises ValueError: if a category is not a known alias
        """
        return {
            chr(code)
            for first, last in cls._get_ranges(categories)
            for code in range(first, last + 1)  # the end code is inclusive
        }

    @classmethod
    def detect(cls, char):
        """
        Find the category of a single character.

        :return: category, or None when neither lookup matches
        :rtype: str
        """
        with open(cls.fpath, encoding="utf-8") as handle:
            payload = json.load(handle)

        # First attempt: the first word of the Unicode character name.
        try:
            script = unicodedata.name(char).split()[0]
        except (TypeError, ValueError):
            # unicodedata.name raises ValueError for characters without a
            # name and TypeError when the argument is not a single character.
            script = None
        if script is not None and script in payload["aliases"]:
            return script

        # Second attempt: the first range from the JSON file holding the code.
        codepoint = ord(char)
        return next(
            (
                entry[2]
                for entry in payload["points"]
                if entry[0] <= codepoint <= entry[1]
            ),
            None,
        )

    @classmethod
    def get_all(cls):
        """
        :return: all known category aliases
        :rtype: set
        """
        with open(cls.fpath, encoding="utf-8") as handle:
            return set(json.load(handle)["aliases"])
92
+
93
+
94
class Languages:
    """
    Work with per-language alphabets stored in ``languages.json``.

    The file is re-read on every call and is used as a mapping from language
    code to the collection of characters of that language.
    """

    fpath = os.path.join(DATA_LOCATION, "languages.json")

    @classmethod
    def get_alphabet(cls, languages):
        """
        :return: set of chars in alphabet by languages list
        :rtype: set
        :raises ValueError: if a language code is not present in the file
        """
        with open(cls.fpath, encoding="utf-8") as handle:
            alphabets = json.load(handle)

        chars = set()
        for code in languages:
            if code not in alphabets:
                raise ValueError("Invalid language code: {}".format(code))
            chars |= set(alphabets[code])
        return chars

    @classmethod
    def detect(cls, char):
        """
        :return: set of languages which alphabet contains passed char.
        :rtype: set
        """
        with open(cls.fpath, encoding="utf-8") as handle:
            alphabets = json.load(handle)
        return {code for code, chars in alphabets.items() if char in chars}

    @classmethod
    def get_all(cls):
        """
        :return: all language codes present in the file
        :rtype: set
        """
        with open(cls.fpath, encoding="utf-8") as handle:
            return set(json.load(handle))
131
+
132
+
133
class Homoglyphs:
    """
    Generate homoglyph variants of text within a configurable alphabet.

    The alphabet is the union of an explicit ``alphabet``, the alphabets of
    the given ``categories`` and the alphabets of the given ``languages``.
    ``self.table`` maps each alphabet character to the confusable characters
    (from ``confusables_sept2022.json``) that are also in the alphabet.
    """

    def __init__(
        self,
        categories=None,
        languages=None,
        alphabet=None,
        strategy=STRATEGY_IGNORE,
        ascii_strategy=STRATEGY_IGNORE,
        ascii_range=ASCII_RANGE,
    ):
        """
        :param categories: category aliases to include in the alphabet
        :param languages: language codes to include in the alphabet
        :param alphabet: extra characters to include in the alphabet
        :param strategy: what to do with a char outside the alphabet; one of
            STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE
        :param ascii_strategy: only compared against STRATEGY_IGNORE, in which
            case ASCII conversion stops as soon as a char has no variant
            inside ``ascii_range``; it is not validated
        :param ascii_range: container of code points accepted as ASCII
        :raises ValueError: if ``strategy`` is not one of the three strategies
        """
        # strategies
        if strategy not in (STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE):
            raise ValueError("Invalid strategy")
        self.strategy = strategy
        self.ascii_strategy = ascii_strategy
        self.ascii_range = ascii_range

        # Homoglyphs must be initialized by any alphabet for correct work
        if not categories and not languages and not alphabet:
            categories = ("LATIN", "COMMON")

        # cats and langs
        self.categories = set(categories or [])
        self.languages = set(languages or [])

        # alphabet
        self.alphabet = set(alphabet or [])
        if self.categories:
            self.alphabet.update(Categories.get_alphabet(self.categories))
        if self.languages:
            self.alphabet.update(Languages.get_alphabet(self.languages))
        self.table = self.get_table(self.alphabet)

    @staticmethod
    def get_table(alphabet):
        """
        Build the homoglyph table for chars and homoglyphs of one alphabet.

        :return: mapping char -> set of homoglyphs, both taken from
            ``alphabet``; chars without any homoglyph have no entry
        :rtype: defaultdict(set)
        """
        # Same lookup as the restricted table with source == target.
        return Homoglyphs.get_restricted_table(alphabet, alphabet)

    @staticmethod
    def get_restricted_table(source_alphabet, target_alphabet):
        """
        Build a homoglyph table from one alphabet into another.

        :param source_alphabet: chars to look up (iterated)
        :param target_alphabet: container of allowed homoglyphs
        :return: mapping source char -> set of its homoglyphs found in
            ``target_alphabet``; chars without any match have no entry
        :rtype: defaultdict(set)
        """
        path = os.path.join(DATA_LOCATION, "confusables_sept2022.json")
        # Explicit UTF-8, like every other data file in this module, so that
        # decoding does not depend on the platform's locale encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        table = defaultdict(set)
        for char in source_alphabet:
            for homoglyph in data.get(char, ()):
                if homoglyph in target_alphabet:
                    table[char].add(homoglyph)
        return table

    @staticmethod
    def uniq_and_sort(data):
        """
        :return: unique items ordered by decreasing length, then by value
        :rtype: list
        """
        return sorted(set(data), key=lambda x: (-len(x), x))

    def _update_alphabet(self, char):
        """
        Extend the alphabet so that it covers ``char`` and rebuild the table.

        Languages containing the char are tried first; the category of the
        char is the fallback.

        :return: False if neither a language nor a category was found,
            True otherwise
        :rtype: bool
        """
        # try detect languages
        langs = Languages.detect(char)
        if langs:
            self.languages.update(langs)
            self.alphabet.update(Languages.get_alphabet(langs))
        else:
            # try detect categories
            category = Categories.detect(char)
            if category is None:
                return False
            self.categories.add(category)
            self.alphabet.update(Categories.get_alphabet([category]))
        # update table for new alphabet
        self.table = self.get_table(self.alphabet)
        return True

    def _get_char_variants(self, char):
        """
        :return: sorted unique variants of ``char`` (including ``char``), or
            the result dictated by ``self.strategy`` when ``char`` is outside
            the alphabet
        :rtype: list
        """
        if char not in self.alphabet:
            if self.strategy == STRATEGY_LOAD:
                if not self._update_alphabet(char):
                    return []
            elif self.strategy == STRATEGY_IGNORE:
                return [char]
            elif self.strategy == STRATEGY_REMOVE:
                return []

        # Work on a copy: updating the set stored in self.table would make
        # the result of later calls depend on which chars were queried before.
        direct = self.table.get(char, set())
        variants = set(direct)
        # add alternatives of the alternatives (one extra level)
        for alt_char in direct:
            variants.update(self.table.get(alt_char, ()))
        # add current char to alternatives
        variants.add(char)

        # uniq, sort and return
        return self.uniq_and_sort(variants)

    def _get_combinations(self, text, ascii=False):
        """
        Lazily yield every string built by picking one variant per char.

        :param ascii: keep only variants whose code point is in
            ``self.ascii_range``
        :return: iter of str; nothing is yielded when no char has variants
        """
        variations = []
        for char in text:
            alt_chars = self._get_char_variants(char)

            if ascii:
                alt_chars = [c for c in alt_chars if ord(c) in self.ascii_range]
                if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
                    # no ASCII variant for this char: give up on the whole text
                    return

            # chars without variants are dropped from the result
            if alt_chars:
                variations.append(alt_chars)
        if variations:
            for variant in product(*variations):
                yield "".join(variant)

    def get_combinations(self, text):
        """
        :return: all homoglyph combinations of ``text``
        :rtype: list
        """
        return list(self._get_combinations(text))

    def _to_ascii(self, text):
        """Lazily yield the combinations of ``text`` that are fully ASCII."""
        for variant in self._get_combinations(text, ascii=True):
            # Check every char rather than only the largest code point, so
            # the test also holds for a non-contiguous ascii_range.
            if all(ord(c) in self.ascii_range for c in variant):
                yield variant

    def to_ascii(self, text):
        """
        :return: unique ASCII variants of ``text``, longest first
        :rtype: list
        """
        return self.uniq_and_sort(self._to_ascii(text))