"""Strip PDF/HTML extraction artefacts from corpus markdown.

Three classes of noise are addressed:

1. **Hard rules** — always stripped:
   - Control characters (form-feeds, NULs, etc. that escape from PDF parsing)
   - Standalone page numbers (`^\\s*\\d+\\s*$`)
   - Page header/footer patterns (`X | 37`, `X | P a g e`)
   - Known UI chrome (Material Design icon labels, TOC nav arrows,
     video transcript markers)

2. **Auto-detected boilerplate** — stripped with a high-confidence threshold:
   - Any line of 12–100 chars that appears at least 10 times in a single
     document (all three numbers are parameters of `clean_corpus_artifacts`)
   - Skips lines that look like markdown headings, metadata key:value lines,
     or list markers
   - Catches "Health Information Privacy Code 2020" repeating 27× as a page
     header

3. **Cosmetic cleanup** — collapse runs of two or more blank lines (three or
   more consecutive line breaks) to a single paragraph break.

Returns (cleaned_text, stats_dict) so the caller can log what was removed.

Used by all `build_*_compilation.py` scripts via
`from clean_artifacts import clean_corpus_artifacts`.
"""

from __future__ import annotations

import re
from collections import Counter

# ---- Hard-coded chrome patterns -----------------------------------
# Each pattern is matched as a *whole-line* fullmatch (case-insensitive)
# against the stripped line content. If any pattern matches, the line is dropped.
_CHROME_PATTERNS = [
    # Standalone page number
    r"\d+",
    # Material Design icon labels (text content of the icon tags).
    # Markitdown sometimes renders underscores as `\_` (markdown-escaped) — match either.
    r"expand\\?_(more|less)",
    r"chevron\\?_(right|left)",
    r"arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"menu",
    r"close",
    r"search",
    r"add|remove",
    r"more\\?_(vert|horiz)",
    # Video transcript markup (HDC and other agency video pages)
    r"#{1,6}\s*Visual",
    r"#{1,6}\s*Transcript",
    # Page header/footer patterns
    r".{1,80}\|\s*\d+",  # "X | 37"
    r".{1,80}\|\s*P\s*a\s*g\s*e",  # "X | P a g e"
    # Navigation chrome
    r"[‹›«»<>]+\s*Back(?:\s*to\s*contents)?",  # "‹ Back to contents"
    r"[‹›«»<>]+\s*(Previous|Next)(?:\s*page)?",
    r"Back\s*to\s*top",
    r"Continued\s*overleaf",
    r"Continued\s*on\s*next\s*page",
    r"Skip\s*to\s*(main\s*content|content)",
]

_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _CHROME_PATTERNS),
    re.IGNORECASE,
)

# List-prefixed chrome: Material Design icon labels rendered as markdown
# list items, e.g. `+ chevron\_left`, `- expand_more`. These survive the
# main `_CHROME_REGEX` because that regex matches the whole stripped line,
# and boilerplate detection deliberately skips list-prefixed lines for safety
# (so it doesn't strip legitimate sub-bullets in legislation). This catalog
# explicitly handles the case where the list prefix decorates known chrome.
_LIST_CHROME_PATTERNS = [
    r"[*+\-]\s+expand\\?_(more|less)",
    r"[*+\-]\s+chevron\\?_(right|left)",
    r"[*+\-]\s+arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"[*+\-]\s+(menu|close|search)",
    r"[*+\-]\s+more\\?_(vert|horiz)",
    r"[*+\-]\s+(add|remove)",
]

_LIST_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _LIST_CHROME_PATTERNS),
    re.IGNORECASE,
)

# Control characters: everything < 0x20 except tab (\x09), newline (\x0a), CR (\x0d)
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Per-line checks, compiled once instead of on every loop iteration.
_PAGE_NUMBER_REGEX = re.compile(r"\d+")
_PAGE_HEADER_REGEX = re.compile(
    r".{1,80}\|\s*(?:\d+|P\s*a\s*g\s*e)\s*",
    re.IGNORECASE,
)
# Metadata line of the form "key: value" (lower-case key).
_METADATA_LINE_REGEX = re.compile(r"^[a-z_]+:\s+\S")
# A repeated line only counts as boilerplate if it "looks like" a header or a
# complete sentence: terminal punctuation, a `| <digit>` marker, a trailing
# four-digit number (e.g. a year), or a navigation arrow.
_HEADER_SHAPE_REGEX = re.compile(r"[.!?:]$|\|\s*\d|\d{4}\s*$|[‹›«»>]")
# A run of 2+ blank lines. Whitespace-only lines (including a lone "\r" left
# over from CRLF input) count as blank, matching the `str.strip()` test used
# when filtering lines.
_BLANK_RUN_REGEX = re.compile(r"\n(?:[^\S\n]*\n){2,}")


def _find_boilerplate(
    lines: list[str],
    repetition_threshold: int,
    min_repeated_len: int,
    max_repeated_len: int,
) -> set[str]:
    """Return the stripped lines that repeat often enough to be boilerplate."""
    stripped_lines = (raw.strip() for raw in lines)
    counts = Counter(content for content in stripped_lines if content)

    boilerplate: set[str] = set()
    for content, count in counts.items():
        if count < repetition_threshold:
            continue
        if not (min_repeated_len <= len(content) <= max_repeated_len):
            continue
        # Word-count guard: chrome typically has 3+ words ("Page 37 | Title",
        # "Back to contents", "Health Information Privacy Code 2020").
        # Sentence-opener fragments like "Advertisements must" or
        # "Practitioners should" are 1-2 words and would falsely match if
        # repeated across many rules.
        if len(content.split()) < 3:
            continue
        # Skip likely-legitimate content: markdown headings, the URL source
        # marker, and list items.
        if content.startswith(("#", "Source:", "- ", "* ", "+ ")):
            continue
        if _METADATA_LINE_REGEX.match(content):
            continue
        # Subsection markers like "(1) blah".
        if content.startswith(("(", "[")) and len(content) < 30:
            continue
        # Skip lines that look like the start of a sentence wrapped over a
        # line boundary (no terminal punctuation and no header-style marker).
        if not _HEADER_SHAPE_REGEX.search(content):
            continue
        boilerplate.add(content)
    return boilerplate


def clean_corpus_artifacts(
    text: str,
    repetition_threshold: int = 10,
    min_repeated_len: int = 12,
    max_repeated_len: int = 100,
) -> tuple[str, dict]:
    """Strip page artefacts, UI chrome, control characters, and repeated boilerplate.

    Args:
        text: raw markdown extracted from PDF/HTML
        repetition_threshold: minimum number of occurrences of a line (after
            stripping surrounding whitespace) for it to count as boilerplate
        min_repeated_len: shortest line considered for boilerplate detection
            (avoids stripping legitimate short content like "(1)" or section
            markers)
        max_repeated_len: longest line considered for boilerplate detection
            (avoids accidentally stripping repeated-prose paragraphs)

    Returns:
        (cleaned_text, stats_dict). The stats dict holds per-category removal
        counts, the sorted list of detected boilerplate strings, and the
        input/output/removed character counts.
    """
    stats = {
        "input_chars": len(text),
        "control_chars": 0,
        "page_numbers": 0,
        "page_headers": 0,
        "chrome_lines": 0,
        "boilerplate_lines": 0,
        "blank_runs_collapsed": 0,
        "boilerplate_strings": [],
    }

    # 1. Strip control characters
    cleaned = _CONTROL_CHAR_REGEX.sub("", text)
    stats["control_chars"] = len(text) - len(cleaned)

    # 2. Detect repeated boilerplate via frequency analysis
    lines = cleaned.split("\n")
    boilerplate = _find_boilerplate(
        lines, repetition_threshold, min_repeated_len, max_repeated_len
    )
    stats["boilerplate_strings"] = sorted(boilerplate)

    # 3. Strip chrome + boilerplate lines line-by-line
    kept: list[str] = []
    for line in lines:
        stripped = line.strip()
        # Always keep blank lines (runs of them are collapsed in step 4)
        if not stripped:
            kept.append(line)
        elif stripped in boilerplate:
            stats["boilerplate_lines"] += 1
        elif _PAGE_NUMBER_REGEX.fullmatch(stripped):
            stats["page_numbers"] += 1
        elif _PAGE_HEADER_REGEX.fullmatch(stripped):
            stats["page_headers"] += 1
        elif _CHROME_REGEX.fullmatch(stripped) or _LIST_CHROME_REGEX.fullmatch(stripped):
            # Plain chrome, or Material Design icons rendered as list items
            stats["chrome_lines"] += 1
        else:
            kept.append(line)
    cleaned = "\n".join(kept)

    # 4. Collapse runs of 2+ blank lines (3+ line breaks) to one paragraph break
    cleaned, stats["blank_runs_collapsed"] = _BLANK_RUN_REGEX.subn("\n\n", cleaned)

    stats["output_chars"] = len(cleaned)
    stats["chars_removed"] = stats["input_chars"] - stats["output_chars"]
    return cleaned, stats


def format_stats(stats: dict, label: str = "") -> str:
    """Format a stats dict into a single line for build-script logging.

    Args:
        stats: stats dict as returned by `clean_corpus_artifacts`; missing
            keys are treated as zero
        label: optional tag (e.g. a document name) included in the output

    Returns:
        One summary line listing every non-zero removal count plus the total
        characters removed, or a "nothing to remove" line when all counts
        are zero.
    """
    bits = []
    for key in (
        "page_numbers",
        "page_headers",
        "chrome_lines",
        "boilerplate_lines",
        "control_chars",
        "blank_runs_collapsed",
    ):
        val = stats.get(key, 0)
        if val:
            bits.append(f"{val} {key.replace('_', '-')}")

    if not bits:
        return f" ✂ Cleaned: nothing to remove ({label})" if label else " ✂ Cleaned: nothing to remove"

    summary = ", ".join(bits)
    chars = stats.get("chars_removed", 0)
    # Guard against division by zero for empty input.
    pct = 100 * chars / stats["input_chars"] if stats.get("input_chars") else 0
    prefix = f" ✂ Cleaned ({label}): " if label else " ✂ Cleaned: "
    return f"{prefix}{summary} → -{chars:,} chars (-{pct:.1f}%)"