Spaces:
Running
Running
"""
Strip PDF/HTML extraction artefacts from corpus markdown.

Three classes of noise are addressed:

1. **Hard rules** — always stripped:
   - Control characters (form-feeds, NULs, etc. that escape from PDF parsing)
   - Standalone page numbers (`^\\s*\\d+\\s*$`)
   - Page header/footer patterns (`X | 37`, `X | P a g e`)
   - Known UI chrome (Material Design icon labels, TOC nav arrows, video transcript markers)

2. **Auto-detected boilerplate** — stripped with a high-confidence threshold:
   - Any line of 12–100 chars that appears at least 10 times in a single document
     (these limits are the default arguments of `clean_corpus_artifacts`)
   - Skips lines that look like markdown headings, metadata key:value lines, or list markers
   - Catches "Health Information Privacy Code 2020" repeating 27× as a page header

3. **Cosmetic cleanup** — collapse runs of two or more blank lines to a single paragraph break.

Returns (cleaned_text, stats_dict) so the caller can log what was removed.

Used by all `build_*_compilation.py` scripts via `from clean_artifacts import clean_corpus_artifacts`.
"""
| from __future__ import annotations | |
| import re | |
| from collections import Counter | |
# ---- Hard-coded chrome patterns -----------------------------------
# Every entry describes one complete line of known page/UI chrome. The
# combined regex is case-insensitive and is meant to be applied with
# `fullmatch` to a line that has already been stripped of surrounding
# whitespace; a line matching any single entry is chrome.
_CHROME_PATTERNS = [
    # A line consisting only of digits (standalone page number).
    r"\d+",
    # Material Design icon labels (the text inside <i class="material-icons">).
    # The underscore may arrive markdown-escaped as `\_`, so the backslash
    # is optional in each of these.
    r"expand\\?_(more|less)",
    r"chevron\\?_(right|left)",
    r"arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"menu",
    r"close",
    r"search",
    r"add|remove",
    r"more\\?_(vert|horiz)",
    # Headings that mark up video transcripts (HDC and other agency video pages).
    r"#{1,6}\s*Visual",
    r"#{1,6}\s*Transcript",
    # Running page headers / footers.
    r".{1,80}\|\s*\d+",  # "X | 37"
    r".{1,80}\|\s*P\s*a\s*g\s*e",  # "X | P a g e"
    # Navigation links.
    r"[‹›«»<>]+\s*Back(?:\s*to\s*contents)?",  # "‹ Back to contents"
    r"[‹›«»<>]+\s*(Previous|Next)(?:\s*page)?",
    r"Back\s*to\s*top",
    r"Continued\s*overleaf",
    r"Continued\s*on\s*next\s*page",
    r"Skip\s*to\s*(main\s*content|content)",
]

# Each pattern is wrapped in its own non-capturing group so that an inner
# alternation (e.g. `add|remove`) cannot bleed into its neighbours.
_CHROME_REGEX = re.compile(
    "|".join(map("(?:{})".format, _CHROME_PATTERNS)),
    flags=re.IGNORECASE,
)
# List-prefixed chrome: Material Design icon labels that were rendered as
# markdown list items, e.g. `+ chevron\_left` or `- expand_more`.
# `_CHROME_REGEX` is applied to the whole stripped line, so the leading
# list marker prevents a match there, and the repeated-boilerplate detector
# in `clean_corpus_artifacts` skips list items on purpose (so legitimate
# sub-bullets in legislation are not stripped). This catalogue covers the
# remaining case: a list marker followed by nothing but a known icon label.
_LIST_CHROME_PATTERNS = [
    r"[*+\-]\s+expand\\?_(more|less)",
    r"[*+\-]\s+chevron\\?_(right|left)",
    r"[*+\-]\s+arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"[*+\-]\s+(menu|close|search)",
    r"[*+\-]\s+more\\?_(vert|horiz)",
    r"[*+\-]\s+(add|remove)",
]

# Case-insensitive union of the patterns above; intended for `fullmatch`
# against a stripped line.
_LIST_CHROME_REGEX = re.compile(
    "|".join(map("(?:{})".format, _LIST_CHROME_PATTERNS)),
    flags=re.IGNORECASE,
)
# Matches any single character below 0x20 other than tab (\x09),
# newline (\x0a) and carriage return (\x0d).
_CONTROL_CHAR_REGEX = re.compile(
    r"[\x00-\x08\x0b\x0c\x0e-\x1f]",
)
# Patterns used by clean_corpus_artifacts, compiled once at import time.
_PAGE_NUMBER_REGEX = re.compile(r"\d+")
_PAGE_HEADER_REGEX = re.compile(
    r".{1,80}\|\s*(?:\d+|P\s*a\s*g\s*e)\s*",
    re.IGNORECASE,
)
_METADATA_LINE_REGEX = re.compile(r"^[a-z_]+:\s+\S")
# A repeated line is only treated as a header/footer when it ends in terminal
# punctuation, contains a `| <digit>` marker, ends in four digits (e.g. a
# year), or contains a navigation arrow.
_HEADER_SHAPE_REGEX = re.compile(r"[.!?:]$|\|\s*\d|\d{4}\s*$|[‹›«»>]")
# A newline followed by two or more lines that hold nothing but whitespace.
_BLANK_RUN_REGEX = re.compile(r"\n(?:[^\S\n]*\n){2,}")


def _looks_like_boilerplate(line: str) -> bool:
    """Return True if a frequently repeated stripped line is shaped like page chrome."""
    # Word-count guard: chrome typically has 3+ words ("Back to contents",
    # "Health Information Privacy Code 2020"). One- or two-word sentence
    # openers such as "Advertisements must" repeat across many rules and
    # must not be stripped.
    if len(line.split()) < 3:
        return False
    # Markdown headings, URL source markers and list items are content.
    if line.startswith(("#", "Source:", "- ", "* ", "+ ")):
        return False
    # Metadata "key: value" lines.
    if _METADATA_LINE_REGEX.match(line):
        return False
    # Short subsection markers like "(1) blah".
    if line.startswith(("(", "[")) and len(line) < 30:
        return False
    # Anything without a header-like shape is assumed to be the start of a
    # sentence wrapped over a line boundary, and is kept.
    return _HEADER_SHAPE_REGEX.search(line) is not None


def clean_corpus_artifacts(
    text: str,
    repetition_threshold: int = 10,
    min_repeated_len: int = 12,
    max_repeated_len: int = 100,
) -> tuple[str, dict]:
    """Strip page artefacts, UI chrome, control characters, and repeated boilerplate.

    The text is processed in four steps: control characters are removed,
    repeated boilerplate lines are detected by frequency, chrome/boilerplate
    lines are dropped line by line, and runs of blank lines are collapsed.
    A line counts as blank when it contains only whitespace, so runs made of
    lines holding spaces, tabs or a stray carriage return are collapsed too.

    Args:
        text: raw markdown extracted from PDF/HTML
        repetition_threshold: minimum number of occurrences of a (stripped)
            line for it to be considered boilerplate
        min_repeated_len: shortest line considered for boilerplate detection (avoids
            stripping legitimate short content like "(1)" or section markers)
        max_repeated_len: longest line considered for boilerplate detection (avoids
            accidentally stripping repeated-prose paragraphs)

    Returns:
        (cleaned_text, stats_dict). The stats dict holds `input_chars`,
        `output_chars`, `chars_removed`, one counter per removal category
        (`control_chars`, `page_numbers`, `page_headers`, `chrome_lines`,
        `boilerplate_lines`, `blank_runs_collapsed`) and `boilerplate_strings`,
        the sorted list of lines detected as boilerplate.
    """
    stats = {
        "input_chars": len(text),
        "control_chars": 0,
        "page_numbers": 0,
        "page_headers": 0,
        "chrome_lines": 0,
        "boilerplate_lines": 0,
        "blank_runs_collapsed": 0,
        "boilerplate_strings": [],
    }

    # 1. Strip control characters
    cleaned = _CONTROL_CHAR_REGEX.sub("", text)
    stats["control_chars"] = len(text) - len(cleaned)

    # 2. Detect repeated boilerplate via frequency analysis
    lines = cleaned.split("\n")
    stripped_lines = [line.strip() for line in lines]
    line_counter = Counter(s for s in stripped_lines if s)
    boilerplate = {
        candidate
        for candidate, count in line_counter.items()
        if count >= repetition_threshold
        and min_repeated_len <= len(candidate) <= max_repeated_len
        and _looks_like_boilerplate(candidate)
    }
    stats["boilerplate_strings"] = sorted(boilerplate)

    # 3. Strip chrome + boilerplate lines line-by-line. The order of the
    #    checks decides which counter a removed line is attributed to.
    kept = []
    for line, stripped in zip(lines, stripped_lines):
        if not stripped:
            # Blank lines are always kept here and collapsed in step 4.
            kept.append(line)
        elif stripped in boilerplate:
            stats["boilerplate_lines"] += 1
        elif _PAGE_NUMBER_REGEX.fullmatch(stripped):
            stats["page_numbers"] += 1
        elif _PAGE_HEADER_REGEX.fullmatch(stripped):
            stats["page_headers"] += 1
        elif _CHROME_REGEX.fullmatch(stripped) or _LIST_CHROME_REGEX.fullmatch(stripped):
            # Known chrome, either bare or decorated with a markdown list marker.
            stats["chrome_lines"] += 1
        else:
            kept.append(line)

    # 4. Collapse runs of two or more blank lines to a single paragraph break.
    #    Whitespace-only lines count as blank, matching the `strip()` test
    #    used in step 3; a plain `\n{3,}` would miss them.
    cleaned, collapsed = _BLANK_RUN_REGEX.subn("\n\n", "\n".join(kept))
    stats["blank_runs_collapsed"] = collapsed

    stats["output_chars"] = len(cleaned)
    stats["chars_removed"] = stats["input_chars"] - stats["output_chars"]
    return cleaned, stats
def format_stats(stats: dict, label: str = "") -> str:
    """Render a cleaning stats dict as one log line for the build scripts.

    Args:
        stats: stats dict as returned by `clean_corpus_artifacts`; missing
            keys are treated as zero.
        label: optional tag (shown in parentheses) identifying what was cleaned.

    Returns:
        A "nothing to remove" message when every reported counter is zero,
        otherwise the non-zero counters followed by the number and
        percentage of characters removed.
    """
    reported_keys = (
        "page_numbers",
        "page_headers",
        "chrome_lines",
        "boilerplate_lines",
        "control_chars",
        "blank_runs_collapsed",
    )
    # Only non-zero counters are mentioned, e.g. "3 page-numbers".
    fragments = [
        f"{stats[key]} {key.replace('_', '-')}"
        for key in reported_keys
        if stats.get(key, 0)
    ]

    if not fragments:
        if label:
            return f" ✂ Cleaned: nothing to remove ({label})"
        return " ✂ Cleaned: nothing to remove"

    summary = ", ".join(fragments)
    chars = stats.get("chars_removed", 0)
    total = stats.get("input_chars")
    # Guard against a missing or zero input size.
    pct = 100 * chars / total if total else 0
    if label:
        prefix = f" ✂ Cleaned ({label}): "
    else:
        prefix = " ✂ Cleaned: "
    return f"{prefix}{summary} → -{chars:,} chars (-{pct:.1f}%)"