ntdservices committed on
Commit
0ebe26d
·
verified ·
1 Parent(s): be01353

Update pdf_utils_finalclean_NYmac_final.py

Browse files
pdf_utils_finalclean_NYmac_final.py CHANGED
@@ -80,7 +80,7 @@ def clean_text(text: str) -> str:
80
  while j < len(words) and is_entirely_double_letters(words[j]):
81
  j += 1
82
  run_len = j - i
83
- if run_len >= 3: # ≥3 consecutive doubled words → drop run
84
  i = j
85
  continue
86
  else: # 1- or 2-word run → keep, dedup for readability
 
80
  while j < len(words) and is_entirely_double_letters(words[j]):
81
  j += 1
82
  run_len = j - i
83
+ if run_len >= 5: # ≥3 consecutive doubled words → drop run
84
  i = j
85
  continue
86
  else: # 1- or 2-word run → keep, dedup for readability