"""
Strip PDF/HTML extraction artefacts from corpus markdown.
Three classes of noise are addressed:
1. **Hard rules** — always stripped:
- Control characters (form-feeds, NULs, etc. that escape from PDF parsing)
- Standalone page numbers (`^\\s*\\d+\\s*$`)
- Page header/footer patterns (`X | 37`, `X | P a g e`)
- Known UI chrome (Material Design icon labels, TOC nav arrows, video transcript markers)
2. **Auto-detected boilerplate** — stripped with a high-confidence threshold:
- Any line of 12–100 chars that appears more than 10 times in a single document
- Skips lines that look like markdown headings, metadata key:value lines, or list markers
- Catches "Health Information Privacy Code 2020" repeating 27× as a page header
3. **Cosmetic cleanup** — collapse runs of 2+ consecutive blank lines to a single paragraph break.
Returns (cleaned_text, stats_dict) so the caller can log what was removed.
Used by all `build_*_compilation.py` scripts via `from clean_artifacts import clean_corpus_artifacts`.
"""
from __future__ import annotations
import re
from collections import Counter
# ---- Hard-coded chrome patterns -----------------------------------
# Each pattern is matched as a *whole-line* fullmatch (case-insensitive)
# against the stripped line content. If any pattern matches, the line is dropped.
_CHROME_PATTERNS = [
    # Standalone page number
    r"\d+",
    # Material Design icon labels (text content of <i class="material-icons"> tags).
    # Markitdown sometimes renders underscores as `\_` (markdown-escaped) — the
    # `\\?` makes the literal backslash optional so both spellings match.
    r"expand\\?_(more|less)",
    r"chevron\\?_(right|left)",
    r"arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"menu",
    r"close",
    r"search",
    r"add|remove",
    r"more\\?_(vert|horiz)",
    # Video transcript markup (HDC and other agency video pages)
    r"#{1,6}\s*Visual",
    r"#{1,6}\s*Transcript",
    # Page header/footer patterns
    r".{1,80}\|\s*\d+",            # "X | 37"
    r".{1,80}\|\s*P\s*a\s*g\s*e",  # "X | P a g e"
    # Navigation chrome
    r"[‹›«»<>]+\s*Back(?:\s*to\s*contents)?",  # "‹ Back to contents"
    r"[‹›«»<>]+\s*(Previous|Next)(?:\s*page)?",
    r"Back\s*to\s*top",
    r"Continued\s*overleaf",
    r"Continued\s*on\s*next\s*page",
    r"Skip\s*to\s*(main\s*content|content)",
]
# Each alternative is wrapped in (?:...) so a bare `|` inside one pattern
# (e.g. `add|remove`) cannot leak across the joined alternation.
_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _CHROME_PATTERNS),
    re.IGNORECASE,
)

# List-prefixed chrome: Material Design icon labels rendered as markdown
# list items, e.g. `+ chevron\_left`, `- expand_more`. These survive the
# main `_CHROME_REGEX` because that regex matches the whole stripped line,
# and `clean_artifacts` deliberately skips list-prefixed lines for safety
# (so it doesn't strip legitimate sub-bullets in legislation). This catalog
# explicitly handles the case where the list prefix decorates known chrome.
_LIST_CHROME_PATTERNS = [
    r"[*+\-]\s+expand\\?_(more|less)",
    r"[*+\-]\s+chevron\\?_(right|left)",
    r"[*+\-]\s+arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"[*+\-]\s+(menu|close|search)",
    r"[*+\-]\s+more\\?_(vert|horiz)",
    r"[*+\-]\s+(add|remove)",
]
_LIST_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _LIST_CHROME_PATTERNS),
    re.IGNORECASE,
)

# Control characters: everything < 0x20 except tab (\x09), newline (\x0a), CR (\x0d)
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def clean_corpus_artifacts(
    text: str,
    repetition_threshold: int = 10,
    min_repeated_len: int = 12,
    max_repeated_len: int = 100,
) -> tuple[str, dict]:
    """Strip page artefacts, UI chrome, control characters, and repeated boilerplate.

    Args:
        text: raw markdown extracted from PDF/HTML
        repetition_threshold: how many times a line must repeat to count as boilerplate
        min_repeated_len: shortest line considered for boilerplate detection (avoids
            stripping legitimate short content like "(1)" or section markers)
        max_repeated_len: longest line considered for boilerplate detection (avoids
            accidentally stripping repeated-prose paragraphs)

    Returns:
        (cleaned_text, stats_dict)
    """
    stats = {
        "input_chars": len(text),
        "control_chars": 0,
        "page_numbers": 0,
        "page_headers": 0,
        "chrome_lines": 0,
        "boilerplate_lines": 0,
        "blank_runs_collapsed": 0,
        "boilerplate_strings": [],
    }
    # 1. Strip control characters (the regex deletes them in place, so the
    #    length difference is exactly the number removed).
    cleaned = _CONTROL_CHAR_REGEX.sub("", text)
    stats["control_chars"] = len(text) - len(cleaned)
    # 2. Detect repeated boilerplate via frequency analysis over stripped lines.
    lines = cleaned.split("\n")
    line_counter = Counter(
        stripped for stripped in (raw.strip() for raw in lines) if stripped
    )
    boilerplate = set()
    for line, count in line_counter.items():
        if count < repetition_threshold:
            continue
        if not (min_repeated_len <= len(line) <= max_repeated_len):
            continue
        # Word-count guard: chrome typically has 3+ words ("Page 37 | Title", "Back
        # to contents", "Health Information Privacy Code 2020"). Sentence-opener
        # fragments like "Advertisements must" or "Practitioners should" are 1-2
        # words and would falsely match if repeated across many rules.
        if len(line.split()) < 3:
            continue
        # Skip likely-legitimate content patterns
        if line.startswith("#"):  # markdown heading
            continue
        if re.match(r"^[a-z_]+:\s+\S", line):  # metadata "key: value"
            continue
        if line.startswith("Source:"):  # URL source marker
            continue
        if line.startswith(("- ", "* ", "+ ")):  # list items
            continue
        if line.startswith(("(", "[")) and len(line) < 30:  # subsection markers like "(1) blah"
            continue
        # Skip lines that look like the start of a sentence wrapped over a line
        # boundary: ends with no terminal punctuation AND no pipe-style header marker.
        # This catches the "Advertisements must" / "Practitioners should" case
        # without false-flagging legitimate page headers (which usually contain a `|`
        # or a year/number, both surviving this check).
        if not re.search(r"[.!?:]$|\|\s*\d|\d{4}\s*$|[‹›«»>]", line):
            continue
        boilerplate.add(line)
    stats["boilerplate_strings"] = sorted(boilerplate)
    # 3. Strip chrome + boilerplate lines line-by-line
    out = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Keep blank lines (collapsed in step 4), but normalise
            # whitespace-only lines to truly empty ones. Previously the raw
            # line was kept, so a blank line containing spaces produced
            # "\n   \n", which the step-4 pattern (consecutive newlines only)
            # could never collapse.
            out.append("")
            continue
        # Boilerplate?
        if stripped in boilerplate:
            stats["boilerplate_lines"] += 1
            continue
        # Standalone page number?
        if re.fullmatch(r"\d+", stripped):
            stats["page_numbers"] += 1
            continue
        # Page header/footer with pipe-number pattern?
        if re.fullmatch(r".{1,80}\|\s*(?:\d+|P\s*a\s*g\s*e)\s*", stripped, re.IGNORECASE):
            stats["page_headers"] += 1
            continue
        # Other chrome?
        if _CHROME_REGEX.fullmatch(stripped):
            stats["chrome_lines"] += 1
            continue
        # List-prefixed chrome (Material Design icons rendered as markdown list items)?
        if _LIST_CHROME_REGEX.fullmatch(stripped):
            stats["chrome_lines"] += 1
            continue
        out.append(line)
    cleaned = "\n".join(out)
    # 4. Collapse runs of 3+ consecutive newlines (i.e. 2+ blank lines) to a
    #    single paragraph break. subn gives the replacement count directly.
    cleaned, n_runs = re.subn(r"\n{3,}", "\n\n", cleaned)
    stats["blank_runs_collapsed"] = n_runs
    stats["output_chars"] = len(cleaned)
    stats["chars_removed"] = stats["input_chars"] - stats["output_chars"]
    return cleaned, stats
def format_stats(stats: dict, label: str = "") -> str:
    """Render a cleaning-stats dict as one human-readable log line.

    Non-zero counters are listed in a fixed order; when nothing was removed
    a short "nothing to remove" line is produced instead.
    """
    counter_keys = (
        "page_numbers", "page_headers", "chrome_lines",
        "boilerplate_lines", "control_chars", "blank_runs_collapsed",
    )
    parts = [
        f"{stats.get(key, 0)} {key.replace('_', '-')}"
        for key in counter_keys
        if stats.get(key, 0)
    ]
    if not parts:
        suffix = f" ({label})" if label else ""
        return f" ✂ Cleaned: nothing to remove{suffix}"
    removed = stats.get("chars_removed", 0)
    total = stats.get("input_chars")
    percent = 100 * removed / total if total else 0
    head = f" ✂ Cleaned ({label}): " if label else " ✂ Cleaned: "
    return f"{head}{', '.join(parts)} → -{removed:,} chars (-{percent:.1f}%)"