# hmc-rag/scripts/clean_artifacts.py
# (webmuppet — initial commit: health marketing compliance RAG, bad8b6c)
"""
Strip PDF/HTML extraction artefacts from corpus markdown.
Three classes of noise are addressed:
1. **Hard rules** — always stripped:
- Control characters (form-feeds, NULs, etc. that escape from PDF parsing)
- Standalone page numbers (`^\\s*\\d+\\s*$`)
- Page header/footer patterns (`X | 37`, `X | P a g e`)
- Known UI chrome (Material Design icon labels, TOC nav arrows, video transcript markers)
2. **Auto-detected boilerplate** — stripped with a high-confidence threshold:
- Any line of 12–100 chars that appears at least 10 times in a single document
- Skips lines that look like markdown headings, metadata key:value lines, or list markers
- Catches "Health Information Privacy Code 2020" repeating 27× as a page header
3. **Cosmetic cleanup** — collapse runs of 2+ consecutive blank lines (3+ newlines) to a single blank line.
Returns (cleaned_text, stats_dict) so the caller can log what was removed.
Used by all `build_*_compilation.py` scripts via `from clean_artifacts import clean_corpus_artifacts`.
"""
from __future__ import annotations
import re
from collections import Counter
# ---- Hard-coded chrome patterns -----------------------------------
# Each pattern is matched as a *whole-line* fullmatch (case-insensitive)
# against the stripped line content. If any pattern matches, the line is dropped.
#
# NOTE: `\\?` inside these raw strings is the regex "optional literal
# backslash", so each icon-label pattern accepts both `expand_more` and the
# markdown-escaped `expand\_more` form.
_CHROME_PATTERNS = [
    # Standalone page number. (Redundant backstop: `clean_corpus_artifacts`
    # also drops bare-digit lines explicitly before consulting this regex.)
    r"\d+",
    # Material Design icon labels (text content of <i class="material-icons"> tags).
    # Markitdown sometimes renders underscores as `\_` (markdown-escaped) — match either.
    r"expand\\?_(more|less)",
    r"chevron\\?_(right|left)",
    r"arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"menu",
    r"close",
    r"search",
    r"add|remove",
    r"more\\?_(vert|horiz)",
    # Video transcript markup (HDC and other agency video pages)
    r"#{1,6}\s*Visual",
    r"#{1,6}\s*Transcript",
    # Page header/footer patterns. (Also redundant backstops: the caller
    # strips pipe-number/pipe-"P a g e" headers before this regex runs.)
    r".{1,80}\|\s*\d+",  # "X | 37"
    r".{1,80}\|\s*P\s*a\s*g\s*e",  # "X | P a g e"
    # Navigation chrome
    r"[‹›«»<>]+\s*Back(?:\s*to\s*contents)?",  # "‹ Back to contents"
    r"[‹›«»<>]+\s*(Previous|Next)(?:\s*page)?",
    r"Back\s*to\s*top",
    r"Continued\s*overleaf",
    r"Continued\s*on\s*next\s*page",
    r"Skip\s*to\s*(main\s*content|content)",
]
# Single compiled alternation; each sub-pattern is wrapped in (?:...) so its
# internal `|` alternations don't bleed across the join boundaries.
_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _CHROME_PATTERNS),
    re.IGNORECASE,
)
# List-prefixed chrome: Material Design icon labels rendered as markdown
# list items, e.g. `+ chevron\_left`, `- expand_more`. These survive the
# main `_CHROME_REGEX` because that regex matches the whole stripped line,
# and `clean_artifacts` deliberately skips list-prefixed lines for safety
# (so it doesn't strip legitimate sub-bullets in legislation). This catalog
# explicitly handles the case where the list prefix decorates known chrome.
_LIST_CHROME_PATTERNS = [
    r"[*+\-]\s+expand\\?_(more|less)",
    r"[*+\-]\s+chevron\\?_(right|left)",
    r"[*+\-]\s+arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"[*+\-]\s+(menu|close|search)",
    r"[*+\-]\s+more\\?_(vert|horiz)",
    r"[*+\-]\s+(add|remove)",
]
_LIST_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _LIST_CHROME_PATTERNS),
    re.IGNORECASE,
)
# Control characters: everything < 0x20 except tab (\x09), newline (\x0a), CR (\x0d).
# These leak out of PDF extraction (form feeds at page breaks, stray NULs).
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
# Per-line patterns for the filtering pass, compiled once at import time
# instead of being rebuilt/looked up inside the per-line loop.
_PAGE_NUMBER_RE = re.compile(r"\d+")
_PAGE_HEADER_RE = re.compile(r".{1,80}\|\s*(?:\d+|P\s*a\s*g\s*e)\s*", re.IGNORECASE)
_METADATA_KV_RE = re.compile(r"^[a-z_]+:\s+\S")
# "Header-ish" signals used by boilerplate detection: terminal punctuation,
# a pipe-number marker, a trailing 4-digit year, or navigation arrows.
_HEADERISH_RE = re.compile(r"[.!?:]$|\|\s*\d|\d{4}\s*$|[‹›«»>]")
# A run of 3+ newlines is 2+ consecutive blank lines.
_BLANK_RUN_RE = re.compile(r"\n{3,}")


def _detect_boilerplate(
    lines: list[str],
    repetition_threshold: int,
    min_repeated_len: int,
    max_repeated_len: int,
) -> set[str]:
    """Return the set of stripped lines judged to be repeated page boilerplate.

    A line qualifies when it repeats at least ``repetition_threshold`` times,
    its length falls in ``[min_repeated_len, max_repeated_len]``, it has 3+
    words, it does not match a likely-legitimate content shape, and it carries
    at least one header-ish signal (see ``_HEADERISH_RE``).
    """
    counts = Counter(l.strip() for l in lines if l.strip())
    boilerplate: set[str] = set()
    for line, count in counts.items():
        if count < repetition_threshold:
            continue
        if not (min_repeated_len <= len(line) <= max_repeated_len):
            continue
        # Word-count guard: chrome typically has 3+ words ("Page 37 | Title",
        # "Back to contents", "Health Information Privacy Code 2020").
        # Sentence-opener fragments like "Advertisements must" are 1-2 words
        # and would falsely match if repeated across many rules.
        if len(line.split()) < 3:
            continue
        # Skip likely-legitimate content patterns.
        if line.startswith("#"):                            # markdown heading
            continue
        if _METADATA_KV_RE.match(line):                     # metadata "key: value"
            continue
        if line.startswith("Source:"):                      # URL source marker
            continue
        if line.startswith(("- ", "* ", "+ ")):             # list items
            continue
        if line.startswith(("(", "[")) and len(line) < 30:  # subsection markers like "(1) blah"
            continue
        # Skip lines that look like the start of a sentence wrapped over a
        # line boundary: no terminal punctuation, pipe-number, trailing year,
        # or nav arrows. Legitimate page headers usually contain a `|` or a
        # year/number and therefore survive this check.
        if not _HEADERISH_RE.search(line):
            continue
        boilerplate.add(line)
    return boilerplate


def clean_corpus_artifacts(
    text: str,
    repetition_threshold: int = 10,
    min_repeated_len: int = 12,
    max_repeated_len: int = 100,
) -> tuple[str, dict]:
    """Strip page artefacts, UI chrome, control characters, and repeated boilerplate.

    Args:
        text: raw markdown extracted from PDF/HTML
        repetition_threshold: minimum number of occurrences for a line to
            count as boilerplate (a count >= threshold qualifies)
        min_repeated_len: shortest line considered for boilerplate detection
            (avoids stripping legitimate short content like "(1)" or section
            markers)
        max_repeated_len: longest line considered for boilerplate detection
            (avoids accidentally stripping repeated-prose paragraphs)

    Returns:
        (cleaned_text, stats_dict) — stats_dict records per-category removal
        counts plus the detected boilerplate strings, so callers can log what
        was removed.
    """
    stats = {
        "input_chars": len(text),
        "control_chars": 0,
        "page_numbers": 0,
        "page_headers": 0,
        "chrome_lines": 0,
        "boilerplate_lines": 0,
        "blank_runs_collapsed": 0,
        "boilerplate_strings": [],
    }
    # 1. Strip control characters that escape from PDF parsing.
    cleaned = _CONTROL_CHAR_REGEX.sub("", text)
    stats["control_chars"] = len(text) - len(cleaned)
    # 2. Detect repeated boilerplate via frequency analysis.
    lines = cleaned.split("\n")
    boilerplate = _detect_boilerplate(
        lines, repetition_threshold, min_repeated_len, max_repeated_len
    )
    stats["boilerplate_strings"] = sorted(boilerplate)
    # 3. Strip chrome + boilerplate line-by-line. Blank lines are kept here
    #    and collapsed in step 4.
    out = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            out.append(line)
            continue
        if stripped in boilerplate:
            stats["boilerplate_lines"] += 1
            continue
        if _PAGE_NUMBER_RE.fullmatch(stripped):             # standalone page number
            stats["page_numbers"] += 1
            continue
        if _PAGE_HEADER_RE.fullmatch(stripped):             # "X | 37" / "X | P a g e"
            stats["page_headers"] += 1
            continue
        if _CHROME_REGEX.fullmatch(stripped) or _LIST_CHROME_REGEX.fullmatch(stripped):
            stats["chrome_lines"] += 1
            continue
        out.append(line)
    cleaned = "\n".join(out)
    # 4. Collapse runs of 2+ consecutive blank lines (3+ newlines) down to a
    #    single blank line. subn does the replacement and the counting in one
    #    pass (previously findall + sub scanned the text twice).
    cleaned, n_runs = _BLANK_RUN_RE.subn("\n\n", cleaned)
    stats["blank_runs_collapsed"] = n_runs
    stats["output_chars"] = len(cleaned)
    stats["chars_removed"] = stats["input_chars"] - stats["output_chars"]
    return cleaned, stats
def format_stats(stats: dict, label: str = "") -> str:
    """Render a cleaning-stats dict as a one-line summary for build-script logs."""
    counted_keys = (
        "page_numbers",
        "page_headers",
        "chrome_lines",
        "boilerplate_lines",
        "control_chars",
        "blank_runs_collapsed",
    )
    # Only non-zero counters make it into the summary; underscores become
    # hyphens for readability ("page_numbers" -> "page-numbers").
    bits = [
        f"{stats.get(key, 0)} {key.replace('_', '-')}"
        for key in counted_keys
        if stats.get(key, 0)
    ]
    if not bits:
        if label:
            return f" ✂ Cleaned: nothing to remove ({label})"
        return " ✂ Cleaned: nothing to remove"
    summary = ", ".join(bits)
    chars = stats.get("chars_removed", 0)
    # Guard against a missing or zero input size before computing the percentage.
    if stats.get("input_chars"):
        pct = 100 * chars / stats["input_chars"]
    else:
        pct = 0
    if label:
        prefix = f" ✂ Cleaned ({label}): "
    else:
        prefix = " ✂ Cleaned: "
    return f"{prefix}{summary} → -{chars:,} chars (-{pct:.1f}%)"