"""
Strip PDF/HTML extraction artefacts from corpus markdown.
Three classes of noise are addressed:
1. **Hard rules** — always stripped:
- Control characters (form-feeds, NULs, etc. that escape from PDF parsing)
- Standalone page numbers (`^\\s*\\d+\\s*$`)
- Page header/footer patterns (`X | 37`, `X | P a g e`)
- Known UI chrome (Material Design icon labels, TOC nav arrows, video transcript markers)
2. **Auto-detected boilerplate** — stripped with a high-confidence threshold:
- Any line of 12–100 chars that appears more than 10 times in a single document
- Skips lines that look like markdown headings, metadata key:value lines, or list markers
- Catches "Health Information Privacy Code 2020" repeating 27× as a page header
3. **Cosmetic cleanup** — collapse runs of 2+ consecutive blank lines to a single paragraph break.
Returns (cleaned_text, stats_dict) so the caller can log what was removed.
Used by all `build_*_compilation.py` scripts via `from clean_artifacts import clean_corpus_artifacts`.
"""
from __future__ import annotations
import re
from collections import Counter
# ---- Hard-coded chrome patterns -----------------------------------
# Each pattern is matched as a *whole-line* fullmatch (case-insensitive)
# against the stripped line content. If any pattern matches, the line is dropped.
_CHROME_PATTERNS = [
    # Standalone page number
    r"\d+",
    # Material Design icon labels (text content of <i class="material-icons"> tags).
    # Markitdown sometimes renders underscores as `\_` (markdown-escaped) — the
    # `\\?` makes the literal backslash optional so both spellings match.
    r"expand\\?_(more|less)",
    r"chevron\\?_(right|left)",
    r"arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"menu",
    r"close",
    r"search",
    r"add|remove",
    r"more\\?_(vert|horiz)",
    # Video transcript markup (HDC and other agency video pages)
    r"#{1,6}\s*Visual",
    r"#{1,6}\s*Transcript",
    # Page header/footer patterns
    r".{1,80}\|\s*\d+",            # "X | 37"
    r".{1,80}\|\s*P\s*a\s*g\s*e",  # "X | P a g e"
    # Navigation chrome
    r"[‹›«»<>]+\s*Back(?:\s*to\s*contents)?",  # "‹ Back to contents"
    r"[‹›«»<>]+\s*(Previous|Next)(?:\s*page)?",
    r"Back\s*to\s*top",
    r"Continued\s*overleaf",
    r"Continued\s*on\s*next\s*page",
    r"Skip\s*to\s*(main\s*content|content)",
]
# Each alternative is wrapped in (?:...) so a bare `|` inside one pattern
# (e.g. `add|remove`) cannot leak across the joined alternation.
_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _CHROME_PATTERNS),
    re.IGNORECASE,
)

# List-prefixed chrome: Material Design icon labels rendered as markdown
# list items, e.g. `+ chevron\_left`, `- expand_more`. These survive the
# main `_CHROME_REGEX` because that regex matches the whole stripped line,
# and `clean_artifacts` deliberately skips list-prefixed lines for safety
# (so it doesn't strip legitimate sub-bullets in legislation). This catalog
# explicitly handles the case where the list prefix decorates known chrome.
_LIST_CHROME_PATTERNS = [
    r"[*+\-]\s+expand\\?_(more|less)",
    r"[*+\-]\s+chevron\\?_(right|left)",
    r"[*+\-]\s+arrow\\?_(forward|back|upward|downward|drop\\?_down|drop\\?_up)",
    r"[*+\-]\s+(menu|close|search)",
    r"[*+\-]\s+more\\?_(vert|horiz)",
    r"[*+\-]\s+(add|remove)",
]
_LIST_CHROME_REGEX = re.compile(
    "|".join(f"(?:{p})" for p in _LIST_CHROME_PATTERNS),
    re.IGNORECASE,
)

# Control characters: everything < 0x20 except tab (\x09), newline (\x0a), CR (\x0d)
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def clean_corpus_artifacts(
    text: str,
    repetition_threshold: int = 10,
    min_repeated_len: int = 12,
    max_repeated_len: int = 100,
) -> tuple[str, dict]:
    """Strip page artefacts, UI chrome, control characters, and repeated boilerplate.

    Args:
        text: raw markdown extracted from PDF/HTML
        repetition_threshold: how many times a line must repeat to count as boilerplate
        min_repeated_len: shortest line considered for boilerplate detection (avoids
            stripping legitimate short content like "(1)" or section markers)
        max_repeated_len: longest line considered for boilerplate detection (avoids
            accidentally stripping repeated-prose paragraphs)

    Returns:
        (cleaned_text, stats_dict)
    """
    stats = {
        "input_chars": len(text),
        "control_chars": 0,
        "page_numbers": 0,
        "page_headers": 0,
        "chrome_lines": 0,
        "boilerplate_lines": 0,
        "blank_runs_collapsed": 0,
        "boilerplate_strings": [],
    }
    # 1. Strip control characters (the regex deletes them in place, so the
    #    length difference is exactly the number removed).
    cleaned = _CONTROL_CHAR_REGEX.sub("", text)
    stats["control_chars"] = len(text) - len(cleaned)
    # 2. Detect repeated boilerplate via frequency analysis over stripped lines.
    lines = cleaned.split("\n")
    line_counter = Counter(
        stripped for stripped in (raw.strip() for raw in lines) if stripped
    )
    boilerplate = set()
    for line, count in line_counter.items():
        if count < repetition_threshold:
            continue
        if not (min_repeated_len <= len(line) <= max_repeated_len):
            continue
        # Word-count guard: chrome typically has 3+ words ("Page 37 | Title", "Back
        # to contents", "Health Information Privacy Code 2020"). Sentence-opener
        # fragments like "Advertisements must" or "Practitioners should" are 1-2
        # words and would falsely match if repeated across many rules.
        if len(line.split()) < 3:
            continue
        # Skip likely-legitimate content patterns
        if line.startswith("#"):  # markdown heading
            continue
        if re.match(r"^[a-z_]+:\s+\S", line):  # metadata "key: value"
            continue
        if line.startswith("Source:"):  # URL source marker
            continue
        if line.startswith(("- ", "* ", "+ ")):  # list items
            continue
        if line.startswith(("(", "[")) and len(line) < 30:  # subsection markers like "(1) blah"
            continue
        # Skip lines that look like the start of a sentence wrapped over a line
        # boundary: ends with no terminal punctuation AND no pipe-style header marker.
        # This catches the "Advertisements must" / "Practitioners should" case
        # without false-flagging legitimate page headers (which usually contain a `|`
        # or a year/number, both surviving this check).
        if not re.search(r"[.!?:]$|\|\s*\d|\d{4}\s*$|[‹›«»>]", line):
            continue
        boilerplate.add(line)
    stats["boilerplate_strings"] = sorted(boilerplate)
    # 3. Strip chrome + boilerplate lines line-by-line
    out = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Keep blank lines (collapsed in step 4), but normalise
            # whitespace-only lines to truly empty ones. Previously the raw
            # line was kept, so a blank line containing spaces produced
            # "\n   \n", which the step-4 pattern (consecutive newlines only)
            # could never collapse.
            out.append("")
            continue
        # Boilerplate?
        if stripped in boilerplate:
            stats["boilerplate_lines"] += 1
            continue
        # Standalone page number?
        if re.fullmatch(r"\d+", stripped):
            stats["page_numbers"] += 1
            continue
        # Page header/footer with pipe-number pattern?
        if re.fullmatch(r".{1,80}\|\s*(?:\d+|P\s*a\s*g\s*e)\s*", stripped, re.IGNORECASE):
            stats["page_headers"] += 1
            continue
        # Other chrome?
        if _CHROME_REGEX.fullmatch(stripped):
            stats["chrome_lines"] += 1
            continue
        # List-prefixed chrome (Material Design icons rendered as markdown list items)?
        if _LIST_CHROME_REGEX.fullmatch(stripped):
            stats["chrome_lines"] += 1
            continue
        out.append(line)
    cleaned = "\n".join(out)
    # 4. Collapse runs of 3+ consecutive newlines (i.e. 2+ blank lines) to a
    #    single paragraph break. subn gives the replacement count directly.
    cleaned, n_runs = re.subn(r"\n{3,}", "\n\n", cleaned)
    stats["blank_runs_collapsed"] = n_runs
    stats["output_chars"] = len(cleaned)
    stats["chars_removed"] = stats["input_chars"] - stats["output_chars"]
    return cleaned, stats
def format_stats(stats: dict, label: str = "") -> str:
    """Render a cleaning-stats dict as one human-readable log line.

    Non-zero counters are listed in a fixed order; when nothing was removed
    a short "nothing to remove" line is produced instead.
    """
    counter_keys = (
        "page_numbers", "page_headers", "chrome_lines",
        "boilerplate_lines", "control_chars", "blank_runs_collapsed",
    )
    parts = [
        f"{stats.get(key, 0)} {key.replace('_', '-')}"
        for key in counter_keys
        if stats.get(key, 0)
    ]
    if not parts:
        suffix = f" ({label})" if label else ""
        return f" ✂ Cleaned: nothing to remove{suffix}"
    removed = stats.get("chars_removed", 0)
    total = stats.get("input_chars")
    percent = 100 * removed / total if total else 0
    head = f" ✂ Cleaned ({label}): " if label else " ✂ Cleaned: "
    return f"{head}{', '.join(parts)} → -{removed:,} chars (-{percent:.1f}%)"