Mustafa Öztürk committed on
Commit
ddd2daa
·
1 Parent(s): c6cc11a

Handle quoted spaced-letter obfuscation in normalization

Browse files
Files changed (1) hide show
  1. app/utils/text_utils.py +12 -4
app/utils/text_utils.py CHANGED
@@ -3,6 +3,10 @@ import unicodedata
3
 
4
 
5
  def _merge_spaced_letter_chains(text: str) -> str:
 
 
 
 
6
  tokens = text.split()
7
  if not tokens:
8
  return text
@@ -13,11 +17,15 @@ def _merge_spaced_letter_chains(text: str) -> str:
13
 
14
  while i < n:
15
  tok = tokens[i]
16
- if len(tok) == 1 and tok.isalpha():
17
- letters = [tok]
 
18
  j = i + 1
19
- while j < n and len(tokens[j]) == 1 and tokens[j].isalpha():
20
- letters.append(tokens[j])
 
 
 
21
  j += 1
22
 
23
  # Join only real obfuscation chains like "g e r i z e k a l i".
 
3
 
4
 
5
  def _merge_spaced_letter_chains(text: str) -> str:
6
+ def _single_alpha(tok: str) -> str:
7
+ cleaned = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ]", "", tok)
8
+ return cleaned if len(cleaned) == 1 and cleaned.isalpha() else ""
9
+
10
  tokens = text.split()
11
  if not tokens:
12
  return text
 
17
 
18
  while i < n:
19
  tok = tokens[i]
20
+ single = _single_alpha(tok)
21
+ if single:
22
+ letters = [single]
23
  j = i + 1
24
+ while j < n:
25
+ next_single = _single_alpha(tokens[j])
26
+ if not next_single:
27
+ break
28
+ letters.append(next_single)
29
  j += 1
30
 
31
  # Join only real obfuscation chains like "g e r i z e k a l i".