# NOTE(review): stray PDF-extraction artifact lines ("Spaces:", "Sleeping",
# "Sleeping") removed from the top of the file — they were not program text.
| import re | |
| from pdfminer.high_level import extract_text | |
| from wordfreq import zipf_frequency # pip install wordfreq | |
| # === Artifact filters === | |
def is_artifact_word(word):
    """Return True when *word* is a single character repeated (e.g. 'aaaa').

    Non-word characters are stripped before the check; a lone leftover
    character does not count as an artifact. Comparison is case-sensitive,
    so 'AAa' is not an artifact.
    """
    letters = re.sub(r"[^\w]", "", word)
    return len(letters) > 1 and len(set(letters)) == 1
def looks_fully_repeated(word):
    """Heuristic for doubled/tripled-letter extraction artifacts.

    True when *word* (non-word chars stripped) contains a case-sensitive
    run of three or more identical characters AND, after collapsing all
    repeated runs, more than one distinct lowercase letter remains (so a
    pure single-letter artifact like 'aaa' is excluded).
    """
    letters = re.sub(r"[^\w]", "", word)
    has_long_run = re.search(r"(.)\1{2,}", letters) is not None
    collapsed = re.sub(r"(.)\1+", r"\1", letters.lower())
    return has_long_run and len(set(collapsed)) > 1
def is_entirely_tripled_letters(word):
    """Return True iff the word is built only of exact three-by-three repeats.

    Case-insensitive; non-word characters are stripped first.
    e.g. 'SSSTTTRRRAAANNNGGG' -> True, 'SSSTTTAA' -> False.
    """
    letters = re.sub(r"[^\w]", "", word)
    if not letters or len(letters) % 3 != 0:
        return False
    lowered = letters.lower()
    return all(lowered[i:i + 3] == lowered[i] * 3
               for i in range(0, len(lowered), 3))
# === Dedup logic ===
# Zipf-frequency threshold: a spelling with a kept double letter must score
# at least this to be considered a plausible English word (used by dedup()).
COMMONNESS = 4.5
# Matches any run of 3+ of the same character. With re.I the backreference
# matches case-insensitively, so mixed-case runs like "AAa" also collapse.
DUP3_RE = re.compile(r"(.)\1{2,}", flags=re.I)
def dedup(word: str) -> str:
    """Collapse PDF-extraction letter doubling in *word*.

    Strategy:
      * Any run of 3+ identical characters is first collapsed to a double
        ("aaa" -> "aa") via DUP3_RE.
      * Each remaining double is then either kept or collapsed to a single
        letter, depending on which spelling is the more plausible English
        word according to wordfreq Zipf scores (threshold COMMONNESS).

    Fixes over the previous version:
      * a kept double preserves its original casing ("Oo" stays "Oo"
        instead of being rewritten as "OO");
      * the Zipf score of the kept spelling is computed once, not twice.
    """
    word = DUP3_RE.sub(lambda m: m.group(1) * 2, word)
    out, i = [], 0
    while i < len(word):
        # A case-insensitive pair: decide keep-double vs. collapse-to-one.
        if i + 1 < len(word) and word[i].lower() == word[i + 1].lower():
            prefix = "".join(out)
            keep = prefix + word[i:]                    # double kept as-is
            single = prefix + word[i] + word[i + 2:]    # one letter dropped
            keep_score = zipf_frequency(keep.lower(), "en")
            if keep_score >= COMMONNESS and \
               keep_score >= zipf_frequency(single.lower(), "en"):
                out.append(word[i:i + 2])  # preserve original casing
            else:
                out.append(word[i])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return "".join(out)
# === Main cleanup ===
def clean_text(text):
    """First-stage cleanup of raw text extracted from a rundown PDF.

    For each line:
      1. Skip the line entirely if (after collapsing repeated characters)
         it matches any production-boilerplate pattern in skip_patterns.
      2. Drop runs of 5+ consecutive "tripled-letter" words (treated as
         SOT-header noise); shorter runs are kept but de-duplicated.
      3. Repair individual words that look like doubled/tripled-letter
         extraction artifacts.
    Finally, stray square brackets are removed and blank lines dropped.
    Returns the surviving lines joined with newlines.
    """
    skip_patterns = [
        r"^\[{2,3}\s*DAY\s+BREAK[^\]]*\]{2,3}$", r"^\[{2,3}\s*DAY\s+ENDING[^\]]*\]{2,3}$",
        r"^\[{2,3}\s*EVE\s+BREAK[^\]]*\]{2,3}$", r"^\[{2,3}\s*ANCHOR\s*\[[^\]]+\]\]{2,3}$",
        r"^\[{2,3}\s*STE[F]?[^\]]*$", r"^\[{2,3}\s*\]{2,3}\s*\]{0,2}$",
        r"^COMMERCIAL_DAY_\d+(_[A-Z]+)?$", r"^COMMERCIAL_DAY_\d+$",
        r"^GOODBYE[\s_]*$", r"^[A-Z]\d+$", r"^\s*[A-Z]\s*$",
        r"^\s*NTD\s+EVE\s*$", r"^\d{1,2}/\d{1,2}/\d{2,4}$", r"^NSR_.*$", r"^CIF_.*$", r"^Morning Show$",
        r"\b\d{3,4}\s+\d{1,2}(?:am|pm)\b", r"COMMERCIAL_.*", r"GOODBYE", r"BREAK_",
        r"DIRECTOR_ONLY", r"\[TWO SHOT\]", r"\[\s*SOT.*", r"\[TAG\s*\]", r"\[\s*REPORTER.*",
        r"NTD COM+ERCIAL", r"DAY_ENDING", r"DAY OPEN",
        r"ANCHOR-GREETINGS", r"FADE TO BLACK", r"MD WIPE FROM BLACK", r"Vmix",
        r"CUT TO BLACK", r"BLACK_OPEN_DIRECTOR_ONLY", r"BLACK_END_DONT_DELETE",
        r"OPEN_DIRECTOR_ONLY", r"No Content", r"Start:", r"End:", r"Printed:",
        r"\d+ of \d+", r"(\d{1,2}/\d{1,2}/\d{2,4}),?\s+\d{1,2}:\d{2}\s*(AM|PM|am|pm)?",
        r"zzz_.*", r"UK_\d+_.*", r"ENG_\d+_.*", r"COMMERCIAL_DAY_\d+_END",
        r"COMMERCIAL_DAY_\d+_BEGIN", r"pkg", r"\[.*?pkg.*?\]", r"\[.*?cam.*?\]",
        r"\b[a-zA-Z]\d{2}\b.*-.*", r"nr\.ntdtv\.com", r"^\s*[B-E]\s*$",
        r".*w+i+d+e+\s+s+h+o+t+.*", r"^[A-Z0-9]+(?:-[A-Z0-9]+)+$",
        r"^[A-Z]{2,}\d+\s+[A-Z0-9]+(?:-[A-Z0-9]+)+$",
    ]
    lines, cleaned = text.splitlines(), []
    for line in lines:
        # Collapse every repeated-character run to a single character so that
        # doubled-letter artifacts (e.g. "GGOOODDBBYYEE") still match the
        # skip patterns above.
        normalized = re.sub(r"(.)\1{1,}", r"\1", line, flags=re.I)
        if any(re.search(p, normalized, flags=re.I) for p in skip_patterns):
            continue
        words, new_line = line.split(), []
        i = 0
        while i < len(words):
            # -- detect runs of tripled-letter words --
            if is_entirely_tripled_letters(words[i]):
                j = i
                while j < len(words) and is_entirely_tripled_letters(words[j]):
                    j += 1
                run_len = j - i
                # NOTE(review): an earlier comment here claimed ">=3"; the
                # code drops only runs of 5 or more words.
                if run_len >= 5:  # long run -> assume SOT header -> DROP
                    i = j
                    continue
                else:  # short (1-4 word) bold span -> keep, but dedup
                    for k in range(i, j):
                        new_line.append(dedup(words[k]))
                    i = j
                    continue
            # -- normal per-word cleanup --
            w = words[i]
            if is_artifact_word(w):
                # Single repeated character (e.g. "aaaa") -> keep one copy.
                new_line.append(w[0])
            elif looks_fully_repeated(w):
                new_line.append(dedup(w))
            else:
                new_line.append(w)
            i += 1
        final = " ".join(new_line).strip()
        if final:
            cleaned.append(final)
    # Remove stray brackets and blank lines
    cleaned_no_brackets = []
    for ln in cleaned:
        ln = ln.replace("[", "").replace("]", "").strip()
        if ln:
            cleaned_no_brackets.append(ln)
    return "\n".join(cleaned_no_brackets)
# === Final cleanup ===
def apply_textpy_cleanup(text):
    """Second-stage cleanup: drop every line containing boilerplate.

    A line is removed when its lowercased form matches (re.search) any
    pattern in patterns_to_skip; surviving lines keep their original
    casing and are rejoined with newlines.

    Improvement over the previous version: the patterns are compiled once
    and each line is lowercased once, instead of re-lowering the line and
    re-resolving each pattern for every (line, pattern) pair.
    """
    patterns_to_skip = [
        r"teaser_", r"cold_open", r"\[anchor\]", r"\b\d{4}\s+\d{1,2}(?:am|pm)\b",
        r"eve break", r"day break", r"\[\[\[\s*stef", r"\[\[\[\s*\]\]\]", r"goodbye",
        r"teaser", r"\[cg\]", r"day ending", r"\[\[\[\s*day ending",
        r"commercial_", r"anchor\s*\[\s*close",
        # NEW PHRASES TO CUT IF LINE CONTAINS THEM
        r"morning show", r"mmoorriinngg\s+sshhooww",
        r"fade_to_black", r"cut_to_black", r"db_coming_back_wipe",
        r"\(second hour starts now\)", r"hello_intro-cold-open",
        r"live-that guy's name-press", r"live-that guy's name-press-round up",
        r"2box_lb", r"eve wipe from black", r"ntd_promo", r"ntd eve",
        r"eve_ending", r"======graveyard======", r"======templates======",
        r"black_end_don't_delete", r"ddaayy bbrreeaakk", r"aanncchhoorr",
        r"eevvee bbrreeaakk", r"aanncchhoorr cclloossee uupp",
        r"ddaayy eennddiinngg", r"open stef from black",
        r"stef segment begins", r"stef segment ends",
        r"welcome to n-t-d newsroom. i'm", r"nsr 1s intro wp", r"anchor",
        r"liveu_a_2box-b-roll", r"liveu_a_sot", r"comercial",
        r"nsr wp from black", r"cc0011 w open stef from black",
        r"stef w segment begins", r"stef w segment ends"
    ]
    # Hoist loop-invariant work: compile once, lowercase each line once.
    compiled = [re.compile(p) for p in patterns_to_skip]
    cleaned = []
    for ln in text.splitlines():
        low = ln.lower()
        if not any(rx.search(low) for rx in compiled):
            cleaned.append(ln)
    return "\n".join(cleaned)
# === Glue function ===
def pdf_to_final_cleaned_text(pdf_path):
    """Extract the text of *pdf_path* and run both cleanup stages on it."""
    return apply_textpy_cleanup(clean_text(extract_text(pdf_path)))