# hmc-rag/scripts/clean_artifacts.py
# (webmuppet — initial commit: health marketing compliance RAG, bad8b6c)
"""
Strip PDF/HTML extraction artefacts from corpus markdown.
Three classes of noise are addressed:
1. **Hard rules** — always stripped:
- Control characters (form-feeds, NULs, etc. that escape from PDF parsing)
- Standalone page numbers (`^\\s*\\d+\\s*$`)
- Page header/footer patterns (`X | 37`, `X | P a g e`)
- Known UI chrome (Material Design icon labels, TOC nav arrows, video transcript markers)
2. **Auto-detected boilerplate** — stripped with a high-confidence threshold:
- Any line of 12–100 chars that appears at least 10 times in a single document
- Skips lines that look like markdown headings, metadata key:value lines, or list markers
- Catches "Health Information Privacy Code 2020" repeating 27× as a page header
3. **Cosmetic cleanup** — collapse runs of 2+ consecutive blank lines (3+ newlines) to a single blank line.
Returns (cleaned_text, stats_dict) so the caller can log what was removed.
Used by all `build_*_compilation.py` scripts via `from clean_artifacts import clean_corpus_artifacts`.
"""
from __future__ import annotations
import re
from collections import Counter
# ---- Hard-coded chrome patterns -----------------------------------
# Each pattern is matched as a *whole-line* fullmatch (case-insensitive)
# against the stripped line content. If any pattern matches, the line is dropped.
#
# NOTE: `\\?` inside these raw strings is the regex "optional literal
# backslash", so each icon-label pattern accepts both `expand_more` and the
# markdown-escaped `expand\_more` form.
_CHROME_PATTERNS = [
    # Standalone page number. (Redundant backstop: `clean_corpus_artifacts`
    # also drops bare-digit lines explicitly before consulting this regex.)
    r"\d+",
    # Material Design icon labels (text content of <i class="material-icons"> tags).
    # Markitdown sometimes renders underscores as `\_` (markdown-escaped) — match either.
    r"expand\\?_(more|less)",
    r"chevron\\?_(right|left)",
    r"arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"menu",
    r"close",
    r"search",
    r"add|remove",
    r"more\\?_(vert|horiz)",
    # Video transcript markup (HDC and other agency video pages)
    r"#{1,6}\s*Visual",
    r"#{1,6}\s*Transcript",
    # Page header/footer patterns. (Also redundant backstops: the caller
    # strips pipe-number/pipe-"P a g e" headers before this regex runs.)
    r".{1,80}\|\s*\d+",  # "X | 37"
    r".{1,80}\|\s*P\s*a\s*g\s*e",  # "X | P a g e"
    # Navigation chrome
    r"[‹›«»<>]+\s*Back(?:\s*to\s*contents)?",  # "‹ Back to contents"
    r"[‹›«»<>]+\s*(Previous|Next)(?:\s*page)?",
    r"Back\s*to\s*top",
    r"Continued\s*overleaf",
    r"Continued\s*on\s*next\s*page",
    r"Skip\s*to\s*(main\s*content|content)",
]
# Single compiled alternation; each sub-pattern is wrapped in (?:...) so its
# internal `|` alternations don't bleed across the join boundaries.
_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _CHROME_PATTERNS),
    re.IGNORECASE,
)
# List-prefixed chrome: Material Design icon labels rendered as markdown
# list items, e.g. `+ chevron\_left`, `- expand_more`. These survive the
# main `_CHROME_REGEX` because that regex matches the whole stripped line,
# and `clean_artifacts` deliberately skips list-prefixed lines for safety
# (so it doesn't strip legitimate sub-bullets in legislation). This catalog
# explicitly handles the case where the list prefix decorates known chrome.
_LIST_CHROME_PATTERNS = [
    r"[*+\-]\s+expand\\?_(more|less)",
    r"[*+\-]\s+chevron\\?_(right|left)",
    r"[*+\-]\s+arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"[*+\-]\s+(menu|close|search)",
    r"[*+\-]\s+more\\?_(vert|horiz)",
    r"[*+\-]\s+(add|remove)",
]
_LIST_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _LIST_CHROME_PATTERNS),
    re.IGNORECASE,
)
# Control characters: everything < 0x20 except tab (\x09), newline (\x0a), CR (\x0d).
# These leak out of PDF extraction (form feeds at page breaks, stray NULs).
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
# Per-line patterns for the filtering pass, compiled once at import time
# instead of being rebuilt/looked up inside the per-line loop.
_PAGE_NUMBER_RE = re.compile(r"\d+")
_PAGE_HEADER_RE = re.compile(r".{1,80}\|\s*(?:\d+|P\s*a\s*g\s*e)\s*", re.IGNORECASE)
_METADATA_KV_RE = re.compile(r"^[a-z_]+:\s+\S")
# "Header-ish" signals used by boilerplate detection: terminal punctuation,
# a pipe-number marker, a trailing 4-digit year, or navigation arrows.
_HEADERISH_RE = re.compile(r"[.!?:]$|\|\s*\d|\d{4}\s*$|[‹›«»>]")
# A run of 3+ newlines is 2+ consecutive blank lines.
_BLANK_RUN_RE = re.compile(r"\n{3,}")


def _detect_boilerplate(
    lines: list[str],
    repetition_threshold: int,
    min_repeated_len: int,
    max_repeated_len: int,
) -> set[str]:
    """Return the set of stripped lines judged to be repeated page boilerplate.

    A line qualifies when it repeats at least ``repetition_threshold`` times,
    its length falls in ``[min_repeated_len, max_repeated_len]``, it has 3+
    words, it does not match a likely-legitimate content shape, and it carries
    at least one header-ish signal (see ``_HEADERISH_RE``).
    """
    counts = Counter(l.strip() for l in lines if l.strip())
    boilerplate: set[str] = set()
    for line, count in counts.items():
        if count < repetition_threshold:
            continue
        if not (min_repeated_len <= len(line) <= max_repeated_len):
            continue
        # Word-count guard: chrome typically has 3+ words ("Page 37 | Title",
        # "Back to contents", "Health Information Privacy Code 2020").
        # Sentence-opener fragments like "Advertisements must" are 1-2 words
        # and would falsely match if repeated across many rules.
        if len(line.split()) < 3:
            continue
        # Skip likely-legitimate content patterns.
        if line.startswith("#"):                            # markdown heading
            continue
        if _METADATA_KV_RE.match(line):                     # metadata "key: value"
            continue
        if line.startswith("Source:"):                      # URL source marker
            continue
        if line.startswith(("- ", "* ", "+ ")):             # list items
            continue
        if line.startswith(("(", "[")) and len(line) < 30:  # subsection markers like "(1) blah"
            continue
        # Skip lines that look like the start of a sentence wrapped over a
        # line boundary: no terminal punctuation, pipe-number, trailing year,
        # or nav arrows. Legitimate page headers usually contain a `|` or a
        # year/number and therefore survive this check.
        if not _HEADERISH_RE.search(line):
            continue
        boilerplate.add(line)
    return boilerplate


def clean_corpus_artifacts(
    text: str,
    repetition_threshold: int = 10,
    min_repeated_len: int = 12,
    max_repeated_len: int = 100,
) -> tuple[str, dict]:
    """Strip page artefacts, UI chrome, control characters, and repeated boilerplate.

    Args:
        text: raw markdown extracted from PDF/HTML
        repetition_threshold: minimum number of occurrences for a line to
            count as boilerplate (a count >= threshold qualifies)
        min_repeated_len: shortest line considered for boilerplate detection
            (avoids stripping legitimate short content like "(1)" or section
            markers)
        max_repeated_len: longest line considered for boilerplate detection
            (avoids accidentally stripping repeated-prose paragraphs)

    Returns:
        (cleaned_text, stats_dict) — stats_dict records per-category removal
        counts plus the detected boilerplate strings, so callers can log what
        was removed.
    """
    stats = {
        "input_chars": len(text),
        "control_chars": 0,
        "page_numbers": 0,
        "page_headers": 0,
        "chrome_lines": 0,
        "boilerplate_lines": 0,
        "blank_runs_collapsed": 0,
        "boilerplate_strings": [],
    }
    # 1. Strip control characters that escape from PDF parsing.
    cleaned = _CONTROL_CHAR_REGEX.sub("", text)
    stats["control_chars"] = len(text) - len(cleaned)
    # 2. Detect repeated boilerplate via frequency analysis.
    lines = cleaned.split("\n")
    boilerplate = _detect_boilerplate(
        lines, repetition_threshold, min_repeated_len, max_repeated_len
    )
    stats["boilerplate_strings"] = sorted(boilerplate)
    # 3. Strip chrome + boilerplate line-by-line. Blank lines are kept here
    #    and collapsed in step 4.
    out = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            out.append(line)
            continue
        if stripped in boilerplate:
            stats["boilerplate_lines"] += 1
            continue
        if _PAGE_NUMBER_RE.fullmatch(stripped):             # standalone page number
            stats["page_numbers"] += 1
            continue
        if _PAGE_HEADER_RE.fullmatch(stripped):             # "X | 37" / "X | P a g e"
            stats["page_headers"] += 1
            continue
        if _CHROME_REGEX.fullmatch(stripped) or _LIST_CHROME_REGEX.fullmatch(stripped):
            stats["chrome_lines"] += 1
            continue
        out.append(line)
    cleaned = "\n".join(out)
    # 4. Collapse runs of 2+ consecutive blank lines (3+ newlines) down to a
    #    single blank line. subn does the replacement and the counting in one
    #    pass (previously findall + sub scanned the text twice).
    cleaned, n_runs = _BLANK_RUN_RE.subn("\n\n", cleaned)
    stats["blank_runs_collapsed"] = n_runs
    stats["output_chars"] = len(cleaned)
    stats["chars_removed"] = stats["input_chars"] - stats["output_chars"]
    return cleaned, stats
def format_stats(stats: dict, label: str = "") -> str:
    """Render a cleaning-stats dict as a one-line summary for build-script logs."""
    counted_keys = (
        "page_numbers",
        "page_headers",
        "chrome_lines",
        "boilerplate_lines",
        "control_chars",
        "blank_runs_collapsed",
    )
    # Only non-zero counters make it into the summary; underscores become
    # hyphens for readability ("page_numbers" -> "page-numbers").
    bits = [
        f"{stats.get(key, 0)} {key.replace('_', '-')}"
        for key in counted_keys
        if stats.get(key, 0)
    ]
    if not bits:
        if label:
            return f" ✂ Cleaned: nothing to remove ({label})"
        return " ✂ Cleaned: nothing to remove"
    summary = ", ".join(bits)
    chars = stats.get("chars_removed", 0)
    # Guard against a missing or zero input size before computing the percentage.
    if stats.get("input_chars"):
        pct = 100 * chars / stats["input_chars"]
    else:
        pct = 0
    if label:
        prefix = f" ✂ Cleaned ({label}): "
    else:
        prefix = " ✂ Cleaned: "
    return f"{prefix}{summary} → -{chars:,} chars (-{pct:.1f}%)"