# anchordashboard / pdf_utils_finalclean_airmac_final.py
# Uploaded by ntdservices — "Update pdf_utils_finalclean_airmac_final.py" (commit 1c74735, verified)
import re
from pdfminer.high_level import extract_text
from wordfreq import zipf_frequency # pip install wordfreq
# === Artifact filters ===
def is_artifact_word(word):
    """Return True when the word is a single character repeated 2+ times.

    Punctuation and other non-word characters are stripped first; the
    comparison is case-sensitive ('Aa' is not an artifact).
    """
    letters = re.sub(r"[^\w]", "", word)
    return len(letters) > 1 and len(set(letters)) == 1
def looks_fully_repeated(word):
    """Return True for words containing a 3+ run of one character while
    still having more than one distinct letter overall (e.g. 'heyyy').

    Pure single-letter runs like 'aaa' are rejected — those are handled
    by is_artifact_word() instead.
    """
    letters = re.sub(r"[^\w]", "", word)
    has_long_run = re.search(r"(.)\1{2,}", letters) is not None
    collapsed = re.sub(r"(.)\1+", r"\1", letters.lower())
    return has_long_run and len(set(collapsed)) > 1
def is_entirely_tripled_letters(word):
    """Return True when the word is built only of back-to-back triples of
    the same letter (case-insensitive): 'SSSTTTRRR' -> True,
    'SSSTTTAA' -> False.

    Non-word characters are ignored; an empty core or a length not
    divisible by three can never be all-triples.
    """
    letters = re.sub(r"[^\w]", "", word)
    if not letters or len(letters) % 3 != 0:
        return False
    for start in range(0, len(letters), 3):
        a, b, c = letters[start:start + 3].lower()
        if not (a == b == c):
            return False
    return True
# === Dedup logic ===
# Minimum Zipf frequency (wordfreq scale, roughly 0-8) a candidate word must
# reach for dedup() to keep a doubled letter as plausible English.
COMMONNESS = 4.5
# Matches any run of 3 or more of the same character, case-insensitively.
DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)
def dedup(word: str) -> str:
    """Repair letter-doubling artifacts in a word.

    Steps:
      * Collapse any run of 3+ identical characters to a double (aaa -> aa).
      * For each remaining doubled pair, decide whether to keep the double
        or reduce it to a single letter, preferring whichever candidate is
        a more plausible English word per its wordfreq Zipf score.

    Returns the repaired word.
    """
    word = DUP3_RE.sub(lambda m: m.group(1) * 2, word)
    out, i = [], 0
    while i < len(word):
        if i + 1 < len(word) and word[i].lower() == word[i + 1].lower():
            prefix = "".join(out)
            # Candidate words: keep the doubled pair vs. drop one letter.
            doubled = prefix + word[i:i + 2] + word[i + 2:]
            single = prefix + word[i] + word[i + 2:]
            # Hoisted: zipf_frequency is a full dictionary lookup — compute
            # the doubled-candidate score once instead of twice.
            doubled_freq = zipf_frequency(doubled.lower(), "en")
            if doubled_freq >= COMMONNESS and \
                    doubled_freq >= zipf_frequency(single.lower(), "en"):
                out.append(word[i] * 2)
            else:
                out.append(word[i])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return "".join(out)
# === Main cleanup ===
def clean_text(text: str) -> str:
    """First-stage cleanup of raw PDF rundown text.

    Drops whole lines that match known production-artifact patterns
    (commercial markers, anchor cues, timestamps, slug lines, ...),
    repairs words mangled by doubled/tripled glyph extraction, and
    finally strips stray square brackets and blank lines.
    """
    skip_patterns = [
        r"^\[{2,3}\s*DAY\s+BREAK[^\]]*\]{2,3}$", r"^\[{2,3}\s*DAY\s+ENDING[^\]]*\]{2,3}$",
        r"^\[{2,3}\s*EVE\s+BREAK[^\]]*\]{2,3}$", r"^\[{2,3}\s*ANCHOR\s*\[[^\]]+\]\]{2,3}$",
        r"^\[{2,3}\s*STE[F]?[^\]]*$", r"^\[{2,3}\s*\]{2,3}\s*\]{0,2}$",
        r"^COMMERCIAL_DAY_\d+(_[A-Z]+)?$", r"^COMMERCIAL_DAY_\d+$",
        r"^GOODBYE[\s_]*$", r"^[A-Z]\d+$", r"^\s*[A-Z]\s*$",
        r"^\s*NTD\s+EVE\s*$", r"^\d{1,2}/\d{1,2}/\d{2,4}$", r"^NSR_.*$", r"^CIF_.*$", r"^Morning Show$",
        r"\b\d{3,4}\s+\d{1,2}(?:am|pm)\b", r"COMMERCIAL_.*", r"GOODBYE", r"BREAK_",
        r"DIRECTOR_ONLY", r"\[TWO SHOT\]", r"\[\s*SOT.*", r"\[TAG\s*\]", r"\[\s*REPORTER.*",
        r"NTD COM+ERCIAL", r"DAY_ENDING", r"DAY OPEN",
        r"ANCHOR-GREETINGS", r"FADE TO BLACK", r"MD WIPE FROM BLACK", r"Vmix",
        r"CUT TO BLACK", r"BLACK_OPEN_DIRECTOR_ONLY", r"BLACK_END_DONT_DELETE",
        r"OPEN_DIRECTOR_ONLY", r"No Content", r"Start:", r"End:", r"Printed:",
        r"\d+ of \d+", r"(\d{1,2}/\d{1,2}/\d{2,4}),?\s+\d{1,2}:\d{2}\s*(AM|PM|am|pm)?",
        r"zzz_.*", r"UK_\d+_.*", r"ENG_\d+_.*", r"COMMERCIAL_DAY_\d+_END",
        r"COMMERCIAL_DAY_\d+_BEGIN", r"pkg", r"\[.*?pkg.*?\]", r"\[.*?cam.*?\]",
        r"\b[a-zA-Z]\d{2}\b.*-.*", r"nr\.ntdtv\.com", r"^\s*[B-E]\s*$",
        r".*w+i+d+e+\s+s+h+o+t+.*", r"^[A-Z0-9]+(?:-[A-Z0-9]+)+$",
        r"^[A-Z]{2,}\d+\s+[A-Z0-9]+(?:-[A-Z0-9]+)+$",
    ]
    lines, cleaned = text.splitlines(), []
    for line in lines:
        # Collapse repeated characters before matching so doubled-letter
        # artifacts (e.g. "DDAAYY BBRREEAAKK") still hit the skip patterns.
        normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
        if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
            continue
        words, new_line = line.split(), []
        i = 0
        while i < len(words):
            # -- detect runs of tripled-letter words --
            if is_entirely_tripled_letters(words[i]):
                j = i
                while j < len(words) and is_entirely_tripled_letters(words[j]):
                    j += 1
                run_len = j - i
                if run_len >= 5:  # 5+ tripled words in a row -> assume SOT header, DROP
                    i = j
                    continue
                else:  # short (under 5 words) bold span -> keep, dedup each word
                    for k in range(i, j):
                        new_line.append(dedup(words[k]))
                    i = j
                    continue
            # -- normal per-word cleanup --
            w = words[i]
            if is_artifact_word(w):
                new_line.append(w[0])
            elif looks_fully_repeated(w):
                new_line.append(dedup(w))
            else:
                new_line.append(w)
            i += 1
        final = " ".join(new_line).strip()
        if final:
            cleaned.append(final)
    # Remove stray brackets and blank lines
    cleaned_no_brackets = []
    for ln in cleaned:
        ln = ln.replace("[", "").replace("]", "").strip()
        if ln:
            cleaned_no_brackets.append(ln)
    return "\n".join(cleaned_no_brackets)
# === Final cleanup ===
def apply_textpy_cleanup(text):
    """Second-stage cleanup: drop any line whose lowercased text matches
    one of the known rundown/control-cue patterns below.

    Unmatched lines pass through unchanged; line order is preserved.
    """
    drop_patterns = (
        r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",
        r"eve break", r"day break", r"\[\[\[\s*stef", r"\[\[\[\s*\]\]\]", r"goodbye",
        r"teaser", r"\[cg\]", r"day ending", r"\[\[\[\s*day ending",
        r"commercial_", r"anchor\s*\[\s*close",
        # NEW PHRASES TO CUT IF LINE CONTAINS THEM
        r"morning show", r"mmoorriinngg\s+sshhooww",
        r"fade_to_black", r"cut_to_black", r"db_coming_back_wipe",
        r"\(second hour starts now\)", r"hello_intro-cold-open",
        r"live-that guy's name-press", r"live-that guy's name-press-round up",
        r"2box_lb", r"eve wipe from black", r"ntd_promo", r"ntd eve",
        r"eve_ending", r"======graveyard======", r"======templates======",
        r"black_end_don't_delete", r"ddaayy bbrreeaakk", r"aanncchhoorr",
        r"eevvee bbrreeaakk", r"aanncchhoorr cclloossee uupp",
        r"ddaayy eennddiinngg", r"open stef from black",
        r"stef segment begins", r"stef segment ends",
        r"welcome to n-t-d newsroom. i'm", r"nsr 1s intro wp", r"anchor",
        r"liveu_a_2box-b-roll", r"liveu_a_sot", r"comercial",
        r"nsr wp from black", r"cc0011 w open stef from black",
        r"stef w segment begins", r"stef w segment ends",
    )
    kept = []
    for line in text.splitlines():
        lowered = line.lower()
        if any(re.search(pat, lowered) for pat in drop_patterns):
            continue
        kept.append(line)
    return "\n".join(kept)
# === Glue function ===
def pdf_to_final_cleaned_text(pdf_path):
    """Extract the text of a PDF and run both cleanup stages over it,
    returning the final cleaned string."""
    return apply_textpy_cleanup(clean_text(extract_text(pdf_path)))