Spaces:
Runtime error
Runtime error
Mustafa Öztürk committed on
Commit ·
cce12b3
1
Parent(s): 67e1e39
Normalize spaced-letter obfuscation in text cleaning
Browse files- app/utils/text_utils.py +34 -1
app/utils/text_utils.py
CHANGED
|
@@ -2,6 +2,38 @@ import re
|
|
| 2 |
import unicodedata
|
| 3 |
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
def clean_text_nfkc(text: str) -> str:
|
| 6 |
text = unicodedata.normalize('NFKC', str(text))
|
| 7 |
text = text.replace('İ', 'i').replace('I', 'ı').lower()
|
|
@@ -11,7 +43,8 @@ def clean_text_nfkc(text: str) -> str:
|
|
| 11 |
text = text.replace(key, value)
|
| 12 |
# Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
|
| 13 |
text = re.sub(r'(.)\1{2,}', r'\1', text)
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def check_blacklist(text: str, blacklist_set: set) -> bool:
|
|
|
|
| 2 |
import unicodedata
|
| 3 |
|
| 4 |
|
| 5 |
+
def _merge_spaced_letter_chains(text: str) -> str:
|
| 6 |
+
tokens = text.split()
|
| 7 |
+
if not tokens:
|
| 8 |
+
return text
|
| 9 |
+
|
| 10 |
+
merged = []
|
| 11 |
+
i = 0
|
| 12 |
+
n = len(tokens)
|
| 13 |
+
|
| 14 |
+
while i < n:
|
| 15 |
+
tok = tokens[i]
|
| 16 |
+
if len(tok) == 1 and tok.isalpha():
|
| 17 |
+
letters = [tok]
|
| 18 |
+
j = i + 1
|
| 19 |
+
while j < n and len(tokens[j]) == 1 and tokens[j].isalpha():
|
| 20 |
+
letters.append(tokens[j])
|
| 21 |
+
j += 1
|
| 22 |
+
|
| 23 |
+
# Join only real obfuscation chains like "g e r i z e k a l i".
|
| 24 |
+
if len(letters) >= 2:
|
| 25 |
+
merged.append("".join(letters))
|
| 26 |
+
else:
|
| 27 |
+
merged.append(tok)
|
| 28 |
+
i = j
|
| 29 |
+
continue
|
| 30 |
+
|
| 31 |
+
merged.append(tok)
|
| 32 |
+
i += 1
|
| 33 |
+
|
| 34 |
+
return " ".join(merged)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def clean_text_nfkc(text: str) -> str:
|
| 38 |
text = unicodedata.normalize('NFKC', str(text))
|
| 39 |
text = text.replace('İ', 'i').replace('I', 'ı').lower()
|
|
|
|
| 43 |
text = text.replace(key, value)
|
| 44 |
# Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
|
| 45 |
text = re.sub(r'(.)\1{2,}', r'\1', text)
|
| 46 |
+
text = " ".join(text.split())
|
| 47 |
+
return _merge_spaced_letter_chains(text)
|
| 48 |
|
| 49 |
|
| 50 |
def check_blacklist(text: str, blacklist_set: set) -> bool:
|