# Source: src/pubguard/text.py — uploaded to the Hugging Face Hub by
# jimnoneill via huggingface_hub (commit 23c2fec, verified).
"""
Text preprocessing for PubGuard.
Designed for text *already extracted from PDFs* (e.g. via pdfplumber,
PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning
OCR / layout artefacts and producing a compact representation that
captures enough signal for the three classification heads.
"""
import re
from typing import Optional
# ── Compiled patterns ────────────────────────────────────────────
_WHITESPACE = re.compile(r"\s+")
_HEADER_JUNK = re.compile(
r"(doi:\s*\S+|https?://\S+|Β©\s*\d{4}|all rights reserved)",
re.IGNORECASE,
)
_PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n")
_LIGATURE = re.compile(r"[fiflffffiffl]")
# Structural markers we look for to characterise document type
SECTION_HEADINGS = re.compile(
r"\b(abstract|introduction|methods?|methodology|results|discussion|"
r"conclusions?|references|bibliography|acknowledgments?|funding|"
r"supplementary|materials?\s+and\s+methods?|related\s+work|"
r"background|literature\s+review|experimental|data\s+availability)\b",
re.IGNORECASE,
)
CITATION_PATTERN = re.compile(
r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)",
)
def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
"""
Normalise raw PDF-extracted text for embedding.
Steps:
1. Replace ligatures with ASCII equivalents.
2. Strip DOIs, URLs, copyright lines.
3. Remove isolated page numbers.
4. Collapse whitespace.
5. Truncate to `max_chars`.
"""
if not text:
return ""
if not isinstance(text, str):
text = str(text)
# Ligatures
text = _LIGATURE.sub(lambda m: {
"fi": "fi", "fl": "fl", "ff": "ff", "ffi": "ffi", "ffl": "ffl"
}.get(m.group(), m.group()), text)
text = _HEADER_JUNK.sub(" ", text)
text = _PAGE_NUMBER.sub("\n", text)
text = _WHITESPACE.sub(" ", text).strip()
return text[:max_chars]
def extract_structural_features(text: str) -> dict:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector. Empty/None input
    yields the all-zero feature dict.
    """
    if not text:
        return _empty_features()

    n_chars = len(text)
    n_words = len(text.split())

    # Section heading density.
    # NOTE(fix): SECTION_HEADINGS matches multi-word headings across any
    # whitespace (e.g. "materials  and\nmethods"), so collapse internal
    # whitespace before comparing against the canonical keys below —
    # previously such headings were silently missed.
    headings = SECTION_HEADINGS.findall(text)
    unique_headings = {" ".join(h.lower().split()) for h in headings}

    # Citation density
    citations = CITATION_PATTERN.findall(text)

    # Character-level ratios
    alpha = sum(c.isalpha() for c in text)
    digit = sum(c.isdigit() for c in text)
    upper = sum(c.isupper() for c in text)

    return {
        # Document length signals (linear, clipped to [0, 1])
        "log_chars": min(1.0, n_chars / 4000),
        "log_words": min(1.0, n_words / 800),
        # Structure signals
        "n_unique_sections": min(1.0, len(unique_headings) / 8),
        "has_abstract": float("abstract" in unique_headings),
        "has_methods": float(bool(unique_headings & {"methods", "methodology", "materials and methods"})),
        "has_references": float(bool(unique_headings & {"references", "bibliography"})),
        "has_introduction": float("introduction" in unique_headings),
        "has_results": float("results" in unique_headings),
        "has_discussion": float("discussion" in unique_headings),
        # Citation density (per 100 words, clipped)
        "citation_density": min(1.0, len(citations) / max(n_words, 1) * 100),
        # Character composition
        "alpha_ratio": alpha / max(n_chars, 1),
        "digit_ratio": digit / max(n_chars, 1),
        "upper_ratio": upper / max(alpha, 1),
        # Mean sentence length (proxy for formality)
        "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50),
    }
def _mean_sentence_length(text: str) -> float:
"""Average words per sentence (rough split on .!?)."""
sentences = re.split(r"[.!?]+", text)
sentences = [s.strip() for s in sentences if s.strip()]
if not sentences:
return 0.0
return sum(len(s.split()) for s in sentences) / len(sentences)
def _empty_features() -> dict:
return {
"log_chars": 0.0, "log_words": 0.0,
"n_unique_sections": 0.0,
"has_abstract": 0.0, "has_methods": 0.0,
"has_references": 0.0, "has_introduction": 0.0,
"has_results": 0.0, "has_discussion": 0.0,
"citation_density": 0.0,
"alpha_ratio": 0.0, "digit_ratio": 0.0, "upper_ratio": 0.0,
"mean_sentence_len": 0.0,
}
# Canonical feature ordering, derived from the zero-feature template so the
# name list and the dicts returned above can never drift apart.
STRUCTURAL_FEATURE_NAMES = list(_empty_features())
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)