# anchordashboard / pdf_utils_finalclean_airmac_final.py
# Uploaded by ntdservices — "Update pdf_utils_finalclean_airmac_final.py" (commit 1c74735, verified)
import re
from pdfminer.high_level import extract_text
from wordfreq import zipf_frequency # pip install wordfreq
# === Artifact filters ===
def is_artifact_word(word):
    """Return True when the word is a single character repeated 2+ times.

    Punctuation and other non-word characters are stripped first; the
    comparison is case-sensitive ('Aa' is not an artifact).
    """
    letters = re.sub(r"[^\w]", "", word)
    return len(letters) > 1 and len(set(letters)) == 1
def looks_fully_repeated(word):
    """Return True for words containing a 3+ run of one character while
    still having more than one distinct letter overall (e.g. 'heyyy').

    Pure single-letter runs like 'aaa' are rejected — those are handled
    by is_artifact_word() instead.
    """
    letters = re.sub(r"[^\w]", "", word)
    has_long_run = re.search(r"(.)\1{2,}", letters) is not None
    collapsed = re.sub(r"(.)\1+", r"\1", letters.lower())
    return has_long_run and len(set(collapsed)) > 1
def is_entirely_tripled_letters(word):
    """Return True when the word is built only of back-to-back triples of
    the same letter (case-insensitive): 'SSSTTTRRR' -> True,
    'SSSTTTAA' -> False.

    Non-word characters are ignored; an empty core or a length not
    divisible by three can never be all-triples.
    """
    letters = re.sub(r"[^\w]", "", word)
    if not letters or len(letters) % 3 != 0:
        return False
    for start in range(0, len(letters), 3):
        a, b, c = letters[start:start + 3].lower()
        if not (a == b == c):
            return False
    return True
# === Dedup logic ===
# Minimum Zipf frequency (wordfreq scale, roughly 0-8) a candidate word must
# reach for dedup() to keep a doubled letter as plausible English.
COMMONNESS = 4.5
# Matches any run of 3 or more of the same character, case-insensitively.
DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)
def dedup(word: str) -> str:
    """Repair letter-doubling artifacts in a word.

    Steps:
      * Collapse any run of 3+ identical characters to a double (aaa -> aa).
      * For each remaining doubled pair, decide whether to keep the double
        or reduce it to a single letter, preferring whichever candidate is
        a more plausible English word per its wordfreq Zipf score.

    Returns the repaired word.
    """
    word = DUP3_RE.sub(lambda m: m.group(1) * 2, word)
    out, i = [], 0
    while i < len(word):
        if i + 1 < len(word) and word[i].lower() == word[i + 1].lower():
            prefix = "".join(out)
            # Candidate words: keep the doubled pair vs. drop one letter.
            doubled = prefix + word[i:i + 2] + word[i + 2:]
            single = prefix + word[i] + word[i + 2:]
            # Hoisted: zipf_frequency is a full dictionary lookup — compute
            # the doubled-candidate score once instead of twice.
            doubled_freq = zipf_frequency(doubled.lower(), "en")
            if doubled_freq >= COMMONNESS and \
                    doubled_freq >= zipf_frequency(single.lower(), "en"):
                out.append(word[i] * 2)
            else:
                out.append(word[i])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return "".join(out)
# === Main cleanup ===
def clean_text(text: str) -> str:
    """First-stage cleanup of raw PDF rundown text.

    Drops whole lines that match known production-artifact patterns
    (commercial markers, anchor cues, timestamps, slug lines, ...),
    repairs words mangled by doubled/tripled glyph extraction, and
    finally strips stray square brackets and blank lines.
    """
    skip_patterns = [
        r"^\[{2,3}\s*DAY\s+BREAK[^\]]*\]{2,3}$", r"^\[{2,3}\s*DAY\s+ENDING[^\]]*\]{2,3}$",
        r"^\[{2,3}\s*EVE\s+BREAK[^\]]*\]{2,3}$", r"^\[{2,3}\s*ANCHOR\s*\[[^\]]+\]\]{2,3}$",
        r"^\[{2,3}\s*STE[F]?[^\]]*$", r"^\[{2,3}\s*\]{2,3}\s*\]{0,2}$",
        r"^COMMERCIAL_DAY_\d+(_[A-Z]+)?$", r"^COMMERCIAL_DAY_\d+$",
        r"^GOODBYE[\s_]*$", r"^[A-Z]\d+$", r"^\s*[A-Z]\s*$",
        r"^\s*NTD\s+EVE\s*$", r"^\d{1,2}/\d{1,2}/\d{2,4}$", r"^NSR_.*$", r"^CIF_.*$", r"^Morning Show$",
        r"\b\d{3,4}\s+\d{1,2}(?:am|pm)\b", r"COMMERCIAL_.*", r"GOODBYE", r"BREAK_",
        r"DIRECTOR_ONLY", r"\[TWO SHOT\]", r"\[\s*SOT.*", r"\[TAG\s*\]", r"\[\s*REPORTER.*",
        r"NTD COM+ERCIAL", r"DAY_ENDING", r"DAY OPEN",
        r"ANCHOR-GREETINGS", r"FADE TO BLACK", r"MD WIPE FROM BLACK", r"Vmix",
        r"CUT TO BLACK", r"BLACK_OPEN_DIRECTOR_ONLY", r"BLACK_END_DONT_DELETE",
        r"OPEN_DIRECTOR_ONLY", r"No Content", r"Start:", r"End:", r"Printed:",
        r"\d+ of \d+", r"(\d{1,2}/\d{1,2}/\d{2,4}),?\s+\d{1,2}:\d{2}\s*(AM|PM|am|pm)?",
        r"zzz_.*", r"UK_\d+_.*", r"ENG_\d+_.*", r"COMMERCIAL_DAY_\d+_END",
        r"COMMERCIAL_DAY_\d+_BEGIN", r"pkg", r"\[.*?pkg.*?\]", r"\[.*?cam.*?\]",
        r"\b[a-zA-Z]\d{2}\b.*-.*", r"nr\.ntdtv\.com", r"^\s*[B-E]\s*$",
        r".*w+i+d+e+\s+s+h+o+t+.*", r"^[A-Z0-9]+(?:-[A-Z0-9]+)+$",
        r"^[A-Z]{2,}\d+\s+[A-Z0-9]+(?:-[A-Z0-9]+)+$",
    ]
    lines, cleaned = text.splitlines(), []
    for line in lines:
        # Collapse repeated characters before matching so doubled-letter
        # artifacts (e.g. "DDAAYY BBRREEAAKK") still hit the skip patterns.
        normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
        if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
            continue
        words, new_line = line.split(), []
        i = 0
        while i < len(words):
            # -- detect runs of tripled-letter words --
            if is_entirely_tripled_letters(words[i]):
                j = i
                while j < len(words) and is_entirely_tripled_letters(words[j]):
                    j += 1
                run_len = j - i
                if run_len >= 5:  # 5+ tripled words in a row -> assume SOT header, DROP
                    i = j
                    continue
                else:  # short (under 5 words) bold span -> keep, dedup each word
                    for k in range(i, j):
                        new_line.append(dedup(words[k]))
                    i = j
                    continue
            # -- normal per-word cleanup --
            w = words[i]
            if is_artifact_word(w):
                new_line.append(w[0])
            elif looks_fully_repeated(w):
                new_line.append(dedup(w))
            else:
                new_line.append(w)
            i += 1
        final = " ".join(new_line).strip()
        if final:
            cleaned.append(final)
    # Remove stray brackets and blank lines
    cleaned_no_brackets = []
    for ln in cleaned:
        ln = ln.replace("[", "").replace("]", "").strip()
        if ln:
            cleaned_no_brackets.append(ln)
    return "\n".join(cleaned_no_brackets)
# === Final cleanup ===
def apply_textpy_cleanup(text):
    """Second-stage cleanup: drop any line whose lowercased text matches
    one of the known rundown/control-cue patterns below.

    Unmatched lines pass through unchanged; line order is preserved.
    """
    drop_patterns = (
        r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",
        r"eve break", r"day break", r"\[\[\[\s*stef", r"\[\[\[\s*\]\]\]", r"goodbye",
        r"teaser", r"\[cg\]", r"day ending", r"\[\[\[\s*day ending",
        r"commercial_", r"anchor\s*\[\s*close",
        # NEW PHRASES TO CUT IF LINE CONTAINS THEM
        r"morning show", r"mmoorriinngg\s+sshhooww",
        r"fade_to_black", r"cut_to_black", r"db_coming_back_wipe",
        r"\(second hour starts now\)", r"hello_intro-cold-open",
        r"live-that guy's name-press", r"live-that guy's name-press-round up",
        r"2box_lb", r"eve wipe from black", r"ntd_promo", r"ntd eve",
        r"eve_ending", r"======graveyard======", r"======templates======",
        r"black_end_don't_delete", r"ddaayy bbrreeaakk", r"aanncchhoorr",
        r"eevvee bbrreeaakk", r"aanncchhoorr cclloossee uupp",
        r"ddaayy eennddiinngg", r"open stef from black",
        r"stef segment begins", r"stef segment ends",
        r"welcome to n-t-d newsroom. i'm", r"nsr 1s intro wp", r"anchor",
        r"liveu_a_2box-b-roll", r"liveu_a_sot", r"comercial",
        r"nsr wp from black", r"cc0011 w open stef from black",
        r"stef w segment begins", r"stef w segment ends",
    )
    kept = []
    for line in text.splitlines():
        lowered = line.lower()
        if any(re.search(pat, lowered) for pat in drop_patterns):
            continue
        kept.append(line)
    return "\n".join(kept)
# === Glue function ===
def pdf_to_final_cleaned_text(pdf_path):
    """Extract the text of a PDF and run both cleanup stages over it,
    returning the final cleaned string."""
    return apply_textpy_cleanup(clean_text(extract_text(pdf_path)))