""" Text preprocessing for PubGuard. Designed for text *already extracted from PDFs* (e.g. via pdfplumber, PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning OCR / layout artefacts and producing a compact representation that captures enough signal for the three classification heads. """ import re from typing import Optional # ── Compiled patterns ──────────────────────────────────────────── _WHITESPACE = re.compile(r"\s+") _HEADER_JUNK = re.compile( r"(doi:\s*\S+|https?://\S+|©\s*\d{4}|all rights reserved)", re.IGNORECASE, ) _PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n") _LIGATURE = re.compile(r"[fiflffffiffl]") # Structural markers we look for to characterise document type SECTION_HEADINGS = re.compile( r"\b(abstract|introduction|methods?|methodology|results|discussion|" r"conclusions?|references|bibliography|acknowledgments?|funding|" r"supplementary|materials?\s+and\s+methods?|related\s+work|" r"background|literature\s+review|experimental|data\s+availability)\b", re.IGNORECASE, ) CITATION_PATTERN = re.compile( r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)", ) def clean_text(text: Optional[str], max_chars: int = 4000) -> str: """ Normalise raw PDF-extracted text for embedding. Steps: 1. Replace ligatures with ASCII equivalents. 2. Strip DOIs, URLs, copyright lines. 3. Remove isolated page numbers. 4. Collapse whitespace. 5. Truncate to `max_chars`. """ if not text: return "" if not isinstance(text, str): text = str(text) # Ligatures text = _LIGATURE.sub(lambda m: { "fi": "fi", "fl": "fl", "ff": "ff", "ffi": "ffi", "ffl": "ffl" }.get(m.group(), m.group()), text) text = _HEADER_JUNK.sub(" ", text) text = _PAGE_NUMBER.sub("\n", text) text = _WHITESPACE.sub(" ", text).strip() return text[:max_chars] def extract_structural_features(text: str) -> dict: """ Cheap heuristic features that augment the embedding signal. Returns a dict of float features (0-1 range) that the linear head can concatenate with the embedding vector. """ if not text: return _empty_features() n_chars = len(text) n_words = len(text.split()) # Section heading density headings = SECTION_HEADINGS.findall(text) unique_headings = set(h.lower() for h in headings) # Citation density citations = CITATION_PATTERN.findall(text) # Character-level ratios alpha = sum(c.isalpha() for c in text) digit = sum(c.isdigit() for c in text) upper = sum(c.isupper() for c in text) return { # Document length signals (log-scaled, clipped) "log_chars": min(1.0, len(text) / 4000), "log_words": min(1.0, n_words / 800), # Structure signals "n_unique_sections": min(1.0, len(unique_headings) / 8), "has_abstract": float("abstract" in unique_headings), "has_methods": float(bool(unique_headings & {"methods", "methodology", "materials and methods"})), "has_references": float(bool(unique_headings & {"references", "bibliography"})), "has_introduction": float("introduction" in unique_headings), "has_results": float("results" in unique_headings), "has_discussion": float("discussion" in unique_headings), # Citation density "citation_density": min(1.0, len(citations) / max(n_words, 1) * 100), # Character composition "alpha_ratio": alpha / max(n_chars, 1), "digit_ratio": digit / max(n_chars, 1), "upper_ratio": upper / max(alpha, 1), # Mean sentence length (proxy for formality) "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50), } def _mean_sentence_length(text: str) -> float: """Average words per sentence (rough split on .!?).""" sentences = re.split(r"[.!?]+", text) sentences = [s.strip() for s in sentences if s.strip()] if not sentences: return 0.0 return sum(len(s.split()) for s in sentences) / len(sentences) def _empty_features() -> dict: return { "log_chars": 0.0, "log_words": 0.0, "n_unique_sections": 0.0, "has_abstract": 0.0, "has_methods": 0.0, "has_references": 0.0, "has_introduction": 0.0, "has_results": 0.0, "has_discussion": 0.0, "citation_density": 0.0, "alpha_ratio": 0.0, "digit_ratio": 0.0, "upper_ratio": 0.0, "mean_sentence_len": 0.0, } STRUCTURAL_FEATURE_NAMES = list(_empty_features().keys()) N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)