"""
Text preprocessing for PubGuard.

Designed for text *already extracted from PDFs* (e.g. via pdfplumber,
PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning
OCR / layout artefacts and producing a compact representation that
captures enough signal for the three classification heads.
"""
| |
|
import re
from typing import Dict, Optional
| |
|
| | |
| |
|
# Pre-compiled patterns used by clean_text().  Non-ASCII characters are
# written as \u escapes so the patterns stay correct even if this file is
# re-encoded or copied through a tool that mangles raw UTF-8.
_WHITESPACE = re.compile(r"\s+")
_HEADER_JUNK = re.compile(
    # DOI strings, URLs, "(c) YYYY" copyright marks, boilerplate rights text.
    r"(doi:\s*\S+|https?://\S+|\u00a9\s*\d{4}|all rights reserved)",
    re.IGNORECASE,
)
# A line holding nothing but a 1-4 digit number.  The match consumes the
# trailing newline, so of two directly consecutive numeric lines only the
# first one is removed.
_PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n")
# Latin typographic ligatures U+FB00..U+FB06 (ff, fi, fl, ffi, ffl, st, st).
_LIGATURE = re.compile(r"[\ufb00-\ufb06]")
_LIGATURE_ASCII = {
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",
    "\ufb06": "st",
}


# Words / phrases treated as section headings.  The pattern has a single
# capturing group, so ``findall`` returns the matched heading text.  It
# matches the words anywhere in the text, not only on heading lines.
SECTION_HEADINGS = re.compile(
    r"\b(abstract|introduction|methods?|methodology|results|discussion|"
    r"conclusions?|references|bibliography|acknowledgments?|funding|"
    r"supplementary|materials?\s+and\s+methods?|related\s+work|"
    r"background|literature\s+review|experimental|data\s+availability)\b",
    re.IGNORECASE,
)


# In-text citations: "[12]", "(Smith et al., 2020)" and "(Smith, 2020)".
CITATION_PATTERN = re.compile(
    r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)",
)


def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
    """
    Normalise raw PDF-extracted text for embedding.

    Steps:
      1. Replace ligatures with ASCII equivalents.
      2. Strip DOIs, URLs, copyright lines.
      3. Remove isolated page numbers.
      4. Collapse whitespace.
      5. Truncate to `max_chars`.

    Args:
        text: Raw extracted text.  ``None`` and other falsy values yield
            ``""``; any other non-string value is converted with ``str()``.
        max_chars: Maximum length of the returned string.  Negative values
            are treated as 0; ``None`` disables truncation.

    Returns:
        The cleaned, single-line text, at most `max_chars` characters long.
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # Every character _LIGATURE can match has an entry in the map.
    text = _LIGATURE.sub(lambda m: _LIGATURE_ASCII[m.group()], text)

    # Page numbers must go before whitespace is collapsed: the pattern
    # relies on the newlines that surround an isolated number.
    text = _HEADER_JUNK.sub(" ", text)
    text = _PAGE_NUMBER.sub("\n", text)
    text = _WHITESPACE.sub(" ", text).strip()

    if max_chars is None:
        return text
    # Clamp so that a negative limit cannot slice from the end of the text.
    return text[:max(max_chars, 0)]
| |
|
| |
|
# Canonical order of the structural features.  The zero template, the
# dict returned by extract_structural_features() and the public name list
# are all derived from this tuple so they cannot drift apart.
_FEATURE_ORDER = (
    "log_chars",
    "log_words",
    "n_unique_sections",
    "has_abstract",
    "has_methods",
    "has_references",
    "has_introduction",
    "has_results",
    "has_discussion",
    "citation_density",
    "alpha_ratio",
    "digit_ratio",
    "upper_ratio",
    "mean_sentence_len",
)

# Runs of sentence-ending punctuation used for the rough sentence split.
_SENTENCE_END = re.compile(r"[.!?]+")


def extract_structural_features(text: str) -> Dict[str, float]:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector.  Keys always appear
    in the order given by ``STRUCTURAL_FEATURE_NAMES``.

    Args:
        text: Document text.  Falsy input yields the all-zero feature
            dict; any other non-string value is converted with ``str()``.

    Returns:
        Mapping of feature name to value:
          * ``log_chars`` / ``log_words``: length in characters / words,
            scaled linearly by 4000 / 800 and capped at 1.0 (despite the
            names, no logarithm is applied).
          * ``n_unique_sections``: distinct heading words found, / 8, capped.
          * ``has_*``: 1.0 if the corresponding heading word occurs.
          * ``citation_density``: citations per 100 words, capped at 1.0.
          * ``alpha_ratio`` / ``digit_ratio``: share of all characters.
          * ``upper_ratio``: upper-case share of alphabetic characters.
          * ``mean_sentence_len``: mean words per sentence / 50, capped.
    """
    if not text:
        return _empty_features()

    if not isinstance(text, str):
        text = str(text)

    n_chars = len(text)
    n_words = len(text.split())

    # Lower-case and collapse internal whitespace so that a multi-word
    # heading broken across lines ("Materials\nand Methods") compares equal
    # to its canonical spelling in the membership checks below.
    unique_headings = {
        " ".join(match.lower().split())
        for match in SECTION_HEADINGS.findall(text)
    }

    n_citations = len(CITATION_PATTERN.findall(text))

    alpha = sum(c.isalpha() for c in text)
    digit = sum(c.isdigit() for c in text)
    upper = sum(c.isupper() for c in text)

    methods_names = {"methods", "methodology", "materials and methods"}
    reference_names = {"references", "bibliography"}

    # Start from the zero template so key set and order match it exactly.
    features = _empty_features()
    features.update(
        {
            "log_chars": min(1.0, n_chars / 4000),
            "log_words": min(1.0, n_words / 800),
            "n_unique_sections": min(1.0, len(unique_headings) / 8),
            "has_abstract": float("abstract" in unique_headings),
            "has_methods": float(bool(unique_headings & methods_names)),
            "has_references": float(bool(unique_headings & reference_names)),
            "has_introduction": float("introduction" in unique_headings),
            "has_results": float("results" in unique_headings),
            "has_discussion": float("discussion" in unique_headings),
            "citation_density": min(1.0, n_citations / max(n_words, 1) * 100),
            "alpha_ratio": alpha / max(n_chars, 1),
            "digit_ratio": digit / max(n_chars, 1),
            "upper_ratio": upper / max(alpha, 1),
            "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50),
        }
    )
    return features


def _mean_sentence_length(text: str) -> float:
    """Average words per sentence (rough split on .!?).

    Any run of ``.``, ``!`` or ``?`` ends a sentence, so abbreviations and
    decimal numbers also split.  Fragments without any word are ignored.
    Returns 0.0 when no sentence contains a word.
    """
    word_counts = [len(part.split()) for part in _SENTENCE_END.split(text)]
    # A fragment with zero words is empty or whitespace-only: skip it.
    word_counts = [count for count in word_counts if count]
    if not word_counts:
        return 0.0
    return sum(word_counts) / len(word_counts)


def _empty_features() -> Dict[str, float]:
    """Return a fresh all-zero feature dict in canonical key order."""
    return dict.fromkeys(_FEATURE_ORDER, 0.0)


# Public feature-name list and vector width for code that concatenates
# these features with an embedding.
STRUCTURAL_FEATURE_NAMES = list(_empty_features().keys())
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
| |
|