|
|
"""
Text preprocessing for PubGuard.

Designed for text *already extracted from PDFs* (e.g. via pdfplumber,
PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning
OCR / layout artefacts and producing a compact representation that
captures enough signal for the three classification heads.
"""
|
|
|
|
|
import re |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
|
|
|
# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE = re.compile(r"\s+")

# Boilerplate fragments: DOIs, URLs, "(c) YYYY" notices and the phrase
# "all rights reserved".  The copyright sign is spelled as the escape
# U+00A9 so the pattern cannot be corrupted when the file is re-saved in
# the wrong encoding.
_HEADER_JUNK = re.compile(
    r"(doi:\s*\S+|https?://\S+|\u00a9\s*\d{4}|all rights reserved)",
    re.IGNORECASE,
)

# A line that holds nothing but a 1-4 digit number, newlines included.
_PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n")

# Latin typographic ligatures U+FB00..U+FB04 (ff, fi, fl, ffi, ffl),
# written as escapes for the same encoding-safety reason as above.
_LIGATURE = re.compile("[\ufb00-\ufb04]")
|
|
|
|
|
|
|
|
# Section names recognised in scholarly text.  The regex engine tries the
# alternatives left to right at each position, so the order is preserved
# exactly as listed.
_SECTION_HEADING_ALTERNATIVES = (
    r"abstract",
    r"introduction",
    r"methods?",
    r"methodology",
    r"results",
    r"discussion",
    r"conclusions?",
    r"references",
    r"bibliography",
    r"acknowledgments?",
    r"funding",
    r"supplementary",
    r"materials?\s+and\s+methods?",
    r"related\s+work",
    r"background",
    r"literature\s+review",
    r"experimental",
    r"data\s+availability",
)

# Case-insensitive, whole-word match of any recognised section name; the
# single capture group holds the matched name.
SECTION_HEADINGS = re.compile(
    r"\b(" + "|".join(_SECTION_HEADING_ALTERNATIVES) + r")\b",
    re.IGNORECASE,
)
|
|
|
|
|
# In-text citation shapes, tried in this order:
#   numeric bracket      -> "[12]"
#   author et al. + year -> "(Smith et al., 2020)"
#   single author + year -> "(Smith, 2020)"
_NUMERIC_CITATION = r"\[\d+\]"
_ET_AL_CITATION = r"\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)"
_AUTHOR_YEAR_CITATION = r"\(\w+,\s*\d{4}\)"

CITATION_PATTERN = re.compile(
    "|".join((_NUMERIC_CITATION, _ET_AL_CITATION, _AUTHOR_YEAR_CITATION))
)
|
|
|
|
|
|
|
|
# Typographic ligatures (U+FB00..U+FB04) mapped to their ASCII spellings.
# Written as escapes so the table cannot be corrupted by an encoding
# mix-up, and kept local to this block so ligature replacement does not
# depend on any other pattern in the module.
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
})


def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
    """
    Normalise raw PDF-extracted text for embedding.

    Steps:
        1. Replace ligatures with ASCII equivalents.
        2. Strip DOIs, URLs, copyright lines.
        3. Remove isolated page numbers.
        4. Collapse whitespace.
        5. Truncate to `max_chars`.

    Args:
        text: Extracted text.  ``None`` and any other falsy value yield
            ``""``; a truthy non-string value is converted with ``str()``.
        max_chars: Length limit applied last, as ``text[:max_chars]``.

    Returns:
        The cleaned text with whitespace runs collapsed to single spaces
        and leading/trailing whitespace removed before truncation.
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # One C-level pass; characters outside the table are left untouched.
    text = text.translate(_LIGATURE_TABLE)

    # Each junk fragment becomes a space so neighbouring words stay apart.
    text = _HEADER_JUNK.sub(" ", text)
    # Must run before whitespace collapsing: the pattern relies on the
    # newlines that surround a bare page number.
    text = _PAGE_NUMBER.sub("\n", text)
    text = _WHITESPACE.sub(" ", text).strip()

    return text[:max_chars]
|
|
|
|
|
|
|
|
def extract_structural_features(text: str) -> dict:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector.

    The result always has the keys of `_empty_features()`, in that order:

    * ``log_chars`` / ``log_words``: length relative to 4000 characters /
      800 words, capped at 1.0.  Despite the names these are linear
      ratios, not logarithms.
    * ``n_unique_sections``: number of distinct `SECTION_HEADINGS` matches
      (anywhere in the text) divided by 8, capped at 1.0.
    * ``has_*``: 1.0 when the corresponding section name was matched.
    * ``citation_density``: `CITATION_PATTERN` matches per 100 words,
      capped at 1.0.
    * ``alpha_ratio`` / ``digit_ratio``: share of alphabetic / digit
      characters among all characters.
    * ``upper_ratio``: uppercase characters relative to alphabetic ones,
      capped at 1.0.
    * ``mean_sentence_len``: mean words per sentence divided by 50,
      capped at 1.0.

    Args:
        text: Text to analyse; a falsy value yields the all-zero dict.
    """
    # Start from the zero template so the key set and order come from a
    # single place.
    features = _empty_features()
    if not text:
        return features

    n_chars = len(text)
    n_words = len(text.split())

    # Lower-case and collapse internal whitespace: the pattern accepts any
    # whitespace run inside multi-word names, so "Materials\nand  Methods"
    # must still compare equal to "materials and methods" below.
    unique_headings = {
        " ".join(match.lower().split())
        for match in SECTION_HEADINGS.findall(text)
    }

    n_citations = len(CITATION_PATTERN.findall(text))

    alpha = sum(c.isalpha() for c in text)
    digit = sum(c.isdigit() for c in text)
    upper = sum(c.isupper() for c in text)

    features.update({
        "log_chars": min(1.0, n_chars / 4000),
        "log_words": min(1.0, n_words / 800),

        "n_unique_sections": min(1.0, len(unique_headings) / 8),
        "has_abstract": float("abstract" in unique_headings),
        "has_methods": float(bool(
            unique_headings & {"methods", "methodology", "materials and methods"}
        )),
        "has_references": float(bool(
            unique_headings & {"references", "bibliography"}
        )),
        "has_introduction": float("introduction" in unique_headings),
        "has_results": float("results" in unique_headings),
        "has_discussion": float("discussion" in unique_headings),

        "citation_density": min(1.0, n_citations / max(n_words, 1) * 100),

        "alpha_ratio": alpha / max(n_chars, 1),
        "digit_ratio": digit / max(n_chars, 1),
        # str.isupper() is also true for some non-letters (e.g. Roman
        # numeral code points), so the raw ratio can exceed 1; clamp it to
        # keep the documented 0-1 range.
        "upper_ratio": min(1.0, upper / max(alpha, 1)),

        "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50),
    })
    return features
|
|
|
|
|
|
|
|
def _mean_sentence_length(text: str) -> float:
    """Return the mean number of words per sentence.

    Sentences are approximated by splitting on runs of ``.``, ``!`` and
    ``?``; fragments that are empty or whitespace-only are ignored.
    Returns 0.0 when no fragment remains.
    """
    word_counts = [
        len(fragment.split())
        for fragment in re.split(r"[.!?]+", text)
        if fragment.strip()
    ]
    if word_counts:
        return sum(word_counts) / len(word_counts)
    return 0.0
|
|
|
|
|
|
|
|
def _empty_features() -> dict:
    """Return a new dict with every structural feature set to 0.0.

    A fresh dict is built on each call.  The key order here is the order
    exposed through STRUCTURAL_FEATURE_NAMES.
    """
    feature_names = (
        "log_chars",
        "log_words",
        "n_unique_sections",
        "has_abstract",
        "has_methods",
        "has_references",
        "has_introduction",
        "has_results",
        "has_discussion",
        "citation_density",
        "alpha_ratio",
        "digit_ratio",
        "upper_ratio",
        "mean_sentence_len",
    )
    return dict.fromkeys(feature_names, 0.0)
|
|
|
|
|
|
|
|
# Feature names in the order produced by the zero-valued template, and
# the length of the resulting feature vector.
STRUCTURAL_FEATURE_NAMES = [*_empty_features()]
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
|
|
|