ntdservices committed on
Commit
1c74735
Β·
verified Β·
1 Parent(s): 0ebe26d

Update pdf_utils_finalclean_airmac_final.py

Browse files
Files changed (1) hide show
  1. pdf_utils_finalclean_airmac_final.py +52 -31
pdf_utils_finalclean_airmac_final.py CHANGED
@@ -10,32 +10,40 @@ def is_artifact_word(word):
10
 
11
  def looks_fully_repeated(word):
12
  core = re.sub(r"[^\w]", "", word)
13
- return bool(re.search(r'(.)\1{2,}', core)) and len(set(re.sub(r'(.)\1+', r'\1', core.lower()))) > 1
14
 
15
  def is_entirely_tripled_letters(word):
 
 
 
 
16
  core = re.sub(r"[^\w]", "", word)
17
- if len(core) % 3 != 0 or len(core) == 0:
18
  return False
19
- return all(core[i].lower() == core[i+1].lower() == core[i+2].lower()
20
  for i in range(0, len(core), 3))
21
 
22
-
23
  # === Dedup logic ===
24
 
25
  COMMONNESS = 4.5
26
  DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)
27
- PAIR_RE = re.compile(r"(.)\1", flags=re.I)
28
 
29
  def dedup(word: str) -> str:
30
- word = DUP3_RE.sub(lambda m: m.group(1)*2, word)
 
 
 
 
 
 
31
  out, i = [], 0
32
  while i < len(word):
33
- if i+1 < len(word) and word[i].lower() == word[i+1].lower():
34
- keep = "".join(out) + word[i:i+2] + word[i+2:]
35
- single = "".join(out) + word[i] + word[i+2:]
36
  if zipf_frequency(keep.lower(), "en") >= COMMONNESS and \
37
  zipf_frequency(keep.lower(), "en") >= zipf_frequency(single.lower(), "en"):
38
- out.append(word[i]*2)
39
  else:
40
  out.append(word[i])
41
  i += 2
@@ -44,7 +52,6 @@ def dedup(word: str) -> str:
44
  i += 1
45
  return "".join(out)
46
 
47
-
48
  # === Main cleanup ===
49
 
50
  def clean_text(text):
@@ -74,32 +81,50 @@ def clean_text(text):
74
  normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
75
  if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
76
  continue
 
77
  words, new_line = line.split(), []
78
- for w in words:
79
- if is_entirely_tripled_letters(w):
80
- continue # remove junk like ttthhhiiisss
81
- elif is_artifact_word(w):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  new_line.append(w[0])
83
  elif looks_fully_repeated(w):
84
  new_line.append(dedup(w))
85
  else:
86
  new_line.append(w)
 
 
87
  final = " ".join(new_line).strip()
88
  if final:
89
  cleaned.append(final)
90
- # Remove brackets from each line and drop empty results
 
91
  cleaned_no_brackets = []
92
- for line in cleaned:
93
- line = line.replace("[", "").replace("]", "").strip()
94
- if line: # keep only non-empty lines
95
- cleaned_no_brackets.append(line)
96
 
97
  return "\n".join(cleaned_no_brackets)
98
 
 
99
 
100
-
101
-
102
- # Final cleanup
103
  def apply_textpy_cleanup(text):
104
  patterns_to_skip = [
105
  r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",
@@ -124,17 +149,13 @@ def apply_textpy_cleanup(text):
124
  r"stef w segment begins", r"stef w segment ends"
125
  ]
126
 
127
- lines = text.splitlines()
128
- cleaned = []
129
- for line in lines:
130
- if any(re.search(p, line.lower()) for p in patterns_to_skip):
131
- continue
132
- cleaned.append(line)
133
  return "\n".join(cleaned)
134
 
 
135
 
136
- # Combine everything
137
  def pdf_to_final_cleaned_text(pdf_path):
138
  raw_text = extract_text(pdf_path)
139
- stage1 = clean_text(raw_text)
140
  return apply_textpy_cleanup(stage1)
 
10
 
11
def looks_fully_repeated(word):
    """
    Return True when *word* contains a run of 3+ identical letters but is
    not merely one letter repeated (e.g. 'hiiii' -> True, 'sssss' -> False).

    Used to decide whether a word should be passed through dedup().
    """
    # Strip punctuation so trailing '!!!' etc. doesn't mask letter runs.
    core = re.sub(r"[^\w]", "", word)
    # flags=re.I keeps run detection consistent with DUP3_RE (compiled with
    # re.I) and clean_text's case-insensitive normalisation, so mixed-case
    # runs like 'hIii' are also caught.
    has_long_run = bool(re.search(r"(.)\1{2,}", core, flags=re.I))
    # Collapse every run to a single letter; more than one distinct letter
    # must remain, otherwise the "word" is pure junk like 'sss'.
    collapsed = re.sub(r"(.)\1+", r"\1", core.lower())
    return has_long_run and len(set(collapsed)) > 1
14
 
15
def is_entirely_tripled_letters(word):
    """
    True iff the word is made only of perfect three-by-three repeats:
    e.g. 'SSSTTTRRRAAANNNGGG' -> True, 'SSSTTTAA' -> False.

    (The previous docstring example 'SSSTTTRRRAAANNNGG' was 16 characters
    long — not a multiple of 3 — so it actually returned False.)
    """
    # Ignore punctuation; only word characters count toward the triples.
    core = re.sub(r"[^\w]", "", word)
    # Must be non-empty and splittable into exact groups of three.
    if len(core) == 0 or len(core) % 3:
        return False
    # Every consecutive group of three must be the same letter
    # (case-insensitively), e.g. 'AaA' counts as one triple.
    return all(core[i].lower() == core[i + 1].lower() == core[i + 2].lower()
               for i in range(0, len(core), 3))
25
 
 
26
  # === Dedup logic ===
27
 
28
  COMMONNESS = 4.5
29
  DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)
 
30
 
31
  def dedup(word: str) -> str:
32
+ """
33
+ β€’ Collapse any β‰₯3-char runs to doubles (aaa β†’ aa).
34
+ β€’ Decide case-by-case whether a remaining double should stay,
35
+ using Wordfreq Zipf scores for plausibility.
36
+ """
37
+ word = DUP3_RE.sub(lambda m: m.group(1) * 2, word)
38
+
39
  out, i = [], 0
40
  while i < len(word):
41
+ if i + 1 < len(word) and word[i].lower() == word[i + 1].lower():
42
+ keep = "".join(out) + word[i:i + 2] + word[i + 2:]
43
+ single = "".join(out) + word[i] + word[i + 2:]
44
  if zipf_frequency(keep.lower(), "en") >= COMMONNESS and \
45
  zipf_frequency(keep.lower(), "en") >= zipf_frequency(single.lower(), "en"):
46
+ out.append(word[i] * 2)
47
  else:
48
  out.append(word[i])
49
  i += 2
 
52
  i += 1
53
  return "".join(out)
54
 
 
55
  # === Main cleanup ===
56
 
57
  def clean_text(text):
 
81
  normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
82
  if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
83
  continue
84
+
85
  words, new_line = line.split(), []
86
+ i = 0
87
+ while i < len(words):
88
+ # ── detect runs of tripled-letter words ──
89
+ if is_entirely_tripled_letters(words[i]):
90
+ j = i
91
+ while j < len(words) and is_entirely_tripled_letters(words[j]):
92
+ j += 1
93
+ run_len = j - i
94
+ if run_len >= 5: # ≥5 ⇒ assume SOT header → DROP
95
+ i = j
96
+ continue
97
+ else: # 1- or 2-word bold span β†’ keep, dedup
98
+ for k in range(i, j):
99
+ new_line.append(dedup(words[k]))
100
+ i = j
101
+ continue
102
+
103
+ # ── normal per-word cleanup ──
104
+ w = words[i]
105
+ if is_artifact_word(w):
106
  new_line.append(w[0])
107
  elif looks_fully_repeated(w):
108
  new_line.append(dedup(w))
109
  else:
110
  new_line.append(w)
111
+ i += 1
112
+
113
  final = " ".join(new_line).strip()
114
  if final:
115
  cleaned.append(final)
116
+
117
+ # Remove stray brackets and blank lines
118
  cleaned_no_brackets = []
119
+ for ln in cleaned:
120
+ ln = ln.replace("[", "").replace("]", "").strip()
121
+ if ln:
122
+ cleaned_no_brackets.append(ln)
123
 
124
  return "\n".join(cleaned_no_brackets)
125
 
126
+ # === Final cleanup ===
127
 
 
 
 
128
  def apply_textpy_cleanup(text):
129
  patterns_to_skip = [
130
  r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",
 
149
  r"stef w segment begins", r"stef w segment ends"
150
  ]
151
 
152
+ cleaned = [ln for ln in text.splitlines()
153
+ if not any(re.search(p, ln.lower()) for p in patterns_to_skip)]
 
 
 
 
154
  return "\n".join(cleaned)
155
 
156
+ # === Glue function ===
157
 
 
158
def pdf_to_final_cleaned_text(pdf_path):
    """
    Full pipeline glue: extract raw text from the PDF at *pdf_path*,
    run the artifact/dedup cleanup, then apply the final pattern-based
    line filtering. Returns the cleaned text as a single string.
    """
    stage1 = clean_text(extract_text(pdf_path))
    return apply_textpy_cleanup(stage1)