Mustafa Öztürk committed on
Commit
cce12b3
·
1 Parent(s): 67e1e39

Normalize spaced-letter obfuscation in text cleaning

Browse files
Files changed (1) hide show
  1. app/utils/text_utils.py +34 -1
app/utils/text_utils.py CHANGED
@@ -2,6 +2,38 @@ import re
2
  import unicodedata
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def clean_text_nfkc(text: str) -> str:
6
  text = unicodedata.normalize('NFKC', str(text))
7
  text = text.replace('İ', 'i').replace('I', 'ı').lower()
@@ -11,7 +43,8 @@ def clean_text_nfkc(text: str) -> str:
11
  text = text.replace(key, value)
12
  # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
13
  text = re.sub(r'(.)\1{2,}', r'\1', text)
14
- return " ".join(text.split())
 
15
 
16
 
17
  def check_blacklist(text: str, blacklist_set: set) -> bool:
 
2
  import unicodedata
3
 
4
 
5
def _merge_spaced_letter_chains(text: str) -> str:
    """Collapse runs of space-separated single letters into one word.

    The text is split on whitespace. Every maximal run of consecutive
    tokens that are exactly one alphabetic character long is concatenated
    into a single token, so "g e r i z e k a l i" becomes "gerizekali".
    All other tokens are kept as they are, and the result is re-joined
    with single spaces.

    Args:
        text: Input string; any amount of whitespace may separate tokens.

    Returns:
        The whitespace-normalized string with single-letter runs merged.
        Empty or whitespace-only input yields an empty string.
    """
    merged = []
    chain = []  # pending run of consecutive single-letter tokens

    for tok in text.split():
        if len(tok) == 1 and tok.isalpha():
            chain.append(tok)
            continue

        # A non-letter or multi-character token ends the current run.
        if chain:
            merged.append("".join(chain))
            chain = []
        merged.append(tok)

    # Flush a run that reaches the end of the text.
    if chain:
        merged.append("".join(chain))

    # NOTE: a run of only two letters (e.g. "a b") is merged as well; an
    # isolated single letter is emitted unchanged because joining a
    # one-element run is a no-op.
    return " ".join(merged)
35
+
36
+
37
  def clean_text_nfkc(text: str) -> str:
38
  text = unicodedata.normalize('NFKC', str(text))
39
  text = text.replace('İ', 'i').replace('I', 'ı').lower()
 
43
  text = text.replace(key, value)
44
  # Keep natural double letters (e.g., "kullanici") and only squash exaggerated repeats.
45
  text = re.sub(r'(.)\1{2,}', r'\1', text)
46
+ text = " ".join(text.split())
47
+ return _merge_spaced_letter_chains(text)
48
 
49
 
50
  def check_blacklist(text: str, blacklist_set: set) -> bool: