Update pdf_utils_finalclean_airmac_final.py

pdf_utils_finalclean_airmac_final.py (CHANGED)

@@ -10,32 +10,40 @@ def is_artifact_word(word):

def looks_fully_repeated(word):
    core = re.sub(r"[^\w]", "", word)
    return bool(re.search(r"(.)\1{2,}", core)) and len(set(re.sub(r"(.)\1+", r"\1", core.lower()))) > 1

def is_entirely_tripled_letters(word):
    """
    True iff the word is made only of perfect three-by-three repeats:
    e.g. 'SSSTTTRRR' → True, 'SSSTTTAA' → False.
    """
    core = re.sub(r"[^\w]", "", word)
    if len(core) == 0 or len(core) % 3:
        return False
    return all(core[i].lower() == core[i + 1].lower() == core[i + 2].lower()
               for i in range(0, len(core), 3))
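A quick smoke test of the two predicates (illustrative inputs; assumes the definitions above are in scope):

    assert looks_fully_repeated("HHHEEELLLOOO")         # tripled run, >1 distinct letter
    assert not looks_fully_repeated("aaa")              # collapses to one distinct letter
    assert is_entirely_tripled_letters("SSSTTTRRR")     # perfect 3-by-3 repeats
    assert not is_entirely_tripled_letters("SSSTTTAA")  # length not a multiple of 3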

# === Dedup logic ===

COMMONNESS = 4.5
DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)

def dedup(word: str) -> str:
    """
    • Collapse any ≥3-char runs to doubles (aaa → aa).
    • Decide case by case whether a remaining double should stay,
      using wordfreq Zipf scores for plausibility.
    """
    word = DUP3_RE.sub(lambda m: m.group(1) * 2, word)

    out, i = [], 0
    while i < len(word):
        if i + 1 < len(word) and word[i].lower() == word[i + 1].lower():
            keep = "".join(out) + word[i:i + 2] + word[i + 2:]
            single = "".join(out) + word[i] + word[i + 2:]
            if zipf_frequency(keep.lower(), "en") >= COMMONNESS and \
               zipf_frequency(keep.lower(), "en") >= zipf_frequency(single.lower(), "en"):
                out.append(word[i] * 2)
            else:
                out.append(word[i])
            i += 2

@@ -44,7 +52,6 @@ def dedup(word: str) -> str:
            i += 1
    return "".join(out)
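How the Zipf gate behaves in practice (a sketch; exact scores depend on wordfreq's frequency table, so the comments are expectations rather than guarantees):

    from wordfreq import zipf_frequency

    print(dedup("Helllo"))  # "lll" collapses to "ll"; "hello" is common
                            # (Zipf ≈ 6 > 4.5) and outscores "helo", so "ll" stays
    print(dedup("annnd"))   # "nnn" collapses to "nn"; "annd" is rare, so the
                            # double is reduced and "and" comes back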

# === Main cleanup ===

def clean_text(text):

@@ -74,32 +81,50 @@ def clean_text(text):
        normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
        if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
            continue

        words, new_line = line.split(), []
        i = 0
        while i < len(words):
            # ── detect runs of tripled-letter words ──
            if is_entirely_tripled_letters(words[i]):
                j = i
                while j < len(words) and is_entirely_tripled_letters(words[j]):
                    j += 1
                run_len = j - i
                if run_len >= 5:  # run of ≥5 words → assume SOT header → DROP
                    i = j
                    continue
                else:             # shorter run → keep the words, deduped
                    for k in range(i, j):
                        new_line.append(dedup(words[k]))
                    i = j
                    continue

            # ── normal per-word cleanup ──
            w = words[i]
            if is_artifact_word(w):
                new_line.append(w[0])
            elif looks_fully_repeated(w):
                new_line.append(dedup(w))
            else:
                new_line.append(w)
            i += 1

        final = " ".join(new_line).strip()
        if final:
            cleaned.append(final)

    # Remove stray brackets and blank lines
    cleaned_no_brackets = []
    for ln in cleaned:
        ln = ln.replace("[", "").replace("]", "").strip()
        if ln:
            cleaned_no_brackets.append(ln)

    return "\n".join(cleaned_no_brackets)
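The run heuristic in one illustrative call (made-up input; assumes is_artifact_word, defined earlier in the file, leaves ordinary words untouched and that none of the file's skip_patterns match):

    demo = "TTTHHHIIISSS IIISSS AAA FFFUUULLLLLL SSSOOOTTT hello world"
    print(clean_text(demo))  # the five tripled-letter words read as an
                             # SOT header and are dropped → "hello world"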

# === Final cleanup ===

def apply_textpy_cleanup(text):
    patterns_to_skip = [
        r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",

@@ -124,17 +149,13 @@ def apply_textpy_cleanup(text):
        r"stef w segment begins", r"stef w segment ends"
    ]

    cleaned = [ln for ln in text.splitlines()
               if not any(re.search(p, ln.lower()) for p in patterns_to_skip)]
    return "\n".join(cleaned)
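This pass is line-oriented: any line matching a skip pattern is removed wholesale (illustrative input; assumes none of the elided patterns fire on the second line):

    src = "teaser_open monday\nActual story text"
    print(apply_textpy_cleanup(src))  # → "Actual story text"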

# === Glue function ===

def pdf_to_final_cleaned_text(pdf_path):
    raw_text = extract_text(pdf_path)
    stage1 = clean_text(raw_text)
    return apply_textpy_cleanup(stage1)
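End-to-end, the pipeline is extract → clean_text → apply_textpy_cleanup. A minimal driver, assuming extract_text is pdfminer.six's high-level helper (the import sits above the diffed region, so this is an assumption) and "script.pdf" is a placeholder path:

    from pdfminer.high_level import extract_text  # assumed source of extract_text
    from wordfreq import zipf_frequency

    print(pdf_to_final_cleaned_text("script.pdf")[:500])  # placeholder path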