Spaces:
Sleeping
Sleeping
Mustafa Öztürk committed on
Commit ·
ddd2daa
1
Parent(s): c6cc11a
Handle quoted spaced-letter obfuscation in normalization
Browse files: app/utils/text_utils.py (+12 -4)
app/utils/text_utils.py
CHANGED
|
@@ -3,6 +3,10 @@ import unicodedata
|
|
| 3 |
|
| 4 |
|
| 5 |
def _merge_spaced_letter_chains(text: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
tokens = text.split()
|
| 7 |
if not tokens:
|
| 8 |
return text
|
|
@@ -13,11 +17,15 @@ def _merge_spaced_letter_chains(text: str) -> str:
|
|
| 13 |
|
| 14 |
while i < n:
|
| 15 |
tok = tokens[i]
|
| 16 |
-
|
| 17 |
-
|
|
|
|
| 18 |
j = i + 1
|
| 19 |
-
while j < n
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
| 21 |
j += 1
|
| 22 |
|
| 23 |
# Join only real obfuscation chains like "g e r i z e k a l i".
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def _merge_spaced_letter_chains(text: str) -> str:
|
| 6 |
+
def _single_alpha(tok: str) -> str:
|
| 7 |
+
cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok)
|
| 8 |
+
return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""
|
| 9 |
+
|
| 10 |
tokens = text.split()
|
| 11 |
if not tokens:
|
| 12 |
return text
|
|
|
|
| 17 |
|
| 18 |
while i < n:
|
| 19 |
tok = tokens[i]
|
| 20 |
+
single = _single_alpha(tok)
|
| 21 |
+
if single:
|
| 22 |
+
letters = [single]
|
| 23 |
j = i + 1
|
| 24 |
+
while j < n:
|
| 25 |
+
next_single = _single_alpha(tokens[j])
|
| 26 |
+
if not next_single:
|
| 27 |
+
break
|
| 28 |
+
letters.append(next_single)
|
| 29 |
j += 1
|
| 30 |
|
| 31 |
# Join only real obfuscation chains like "g e r i z e k a l i".
|