Upload src/pubguard/text.py with huggingface_hub
Browse files- src/pubguard/text.py +141 -0
src/pubguard/text.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text preprocessing for PubGuard.
|
| 3 |
+
|
| 4 |
+
Designed for text *already extracted from PDFs* (e.g. via pdfplumber,
|
| 5 |
+
PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning
|
| 6 |
+
OCR / layout artefacts and producing a compact representation that
|
| 7 |
+
captures enough signal for the three classification heads.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
# ββ Compiled patterns ββββββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
+
|
| 15 |
+
_WHITESPACE = re.compile(r"\s+")
|
| 16 |
+
_HEADER_JUNK = re.compile(
|
| 17 |
+
r"(doi:\s*\S+|https?://\S+|Β©\s*\d{4}|all rights reserved)",
|
| 18 |
+
re.IGNORECASE,
|
| 19 |
+
)
|
| 20 |
+
_PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n")
|
| 21 |
+
_LIGATURE = re.compile(r"[ο¬ο¬ο¬ο¬ο¬]")
|
| 22 |
+
|
| 23 |
+
# Structural markers we look for to characterise document type.
# The single capture group spans the whole alternation, so .findall()
# returns the matched heading text itself. Matches singular forms too
# ("method", "conclusion") and arbitrary whitespace inside multi-word
# headings ("materials  and\nmethods").
SECTION_HEADINGS = re.compile(
    r"\b(abstract|introduction|methods?|methodology|results|discussion|"
    r"conclusions?|references|bibliography|acknowledgments?|funding|"
    r"supplementary|materials?\s+and\s+methods?|related\s+work|"
    r"background|literature\s+review|experimental|data\s+availability)\b",
    re.IGNORECASE,
)

# Inline citation markers: numeric "[12]", author-year "(Smith et al., 2020)"
# and "(Smith, 2020)" forms. No capture groups, so .findall() returns
# whole matches. Case-sensitive by design (author surnames).
CITATION_PATTERN = re.compile(
    r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)",
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
    """
    Normalise raw PDF-extracted text for embedding.

    Steps:
        1. Replace ligatures with ASCII equivalents.
        2. Strip DOIs, URLs, copyright lines.
        3. Remove isolated page numbers.
        4. Collapse whitespace.
        5. Truncate to `max_chars`.

    Parameters
    ----------
    text : Optional[str]
        Raw extracted text; ``None`` / empty yields ``""``. Non-str
        values are coerced via ``str()``.
    max_chars : int
        Hard truncation limit applied after cleaning.
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # Ligatures -> ASCII. The keys are the distinct Unicode ligature
    # code points (U+FB01, U+FB02, U+FB00, U+FB03, U+FB04); the previous
    # revision's mojibake had collapsed them into one duplicated key.
    ligature_map = {"ﬁ": "fi", "ﬂ": "fl", "ﬀ": "ff", "ﬃ": "ffi", "ﬄ": "ffl"}
    text = _LIGATURE.sub(lambda m: ligature_map.get(m.group(), m.group()), text)

    text = _HEADER_JUNK.sub(" ", text)
    # Page numbers are removed before whitespace collapse, while line
    # boundaries still exist.
    text = _PAGE_NUMBER.sub("\n", text)
    text = _WHITESPACE.sub(" ", text).strip()

    return text[:max_chars]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def extract_structural_features(text: str) -> dict:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector. Key order is
    stable and mirrors `_empty_features()` / STRUCTURAL_FEATURE_NAMES.
    The "log_*" key names are kept for backward compatibility even
    though the scaling is a linear clip, not logarithmic.
    """
    if not text:
        return _empty_features()

    n_chars = len(text)
    n_words = len(text.split())

    # Section headings. Normalise internal whitespace so that e.g.
    # "materials  and\nmethods" (which the regex's \s+ allows)
    # compares equal to the canonical single-spaced form.
    headings = SECTION_HEADINGS.findall(text)
    unique_headings = {re.sub(r"\s+", " ", h.lower()) for h in headings}

    # Citation density
    citations = CITATION_PATTERN.findall(text)

    # Character-level ratios
    alpha = sum(c.isalpha() for c in text)
    digit = sum(c.isdigit() for c in text)
    upper = sum(c.isupper() for c in text)

    # "method", "methods", "methodology" and every "material(s) and
    # method(s)" variant all count as a methods section; the regex
    # matches the singular forms, which an exact-set test would miss.
    has_methods = any("method" in h for h in unique_headings)

    return {
        # Document length signals (linear clip to [0, 1])
        "log_chars": min(1.0, n_chars / 4000),
        "log_words": min(1.0, n_words / 800),

        # Structure signals
        "n_unique_sections": min(1.0, len(unique_headings) / 8),
        "has_abstract": float("abstract" in unique_headings),
        "has_methods": float(has_methods),
        "has_references": float(bool(unique_headings & {"references", "bibliography"})),
        "has_introduction": float("introduction" in unique_headings),
        "has_results": float("results" in unique_headings),
        "has_discussion": float("discussion" in unique_headings),

        # Citations per 100 words, clipped to [0, 1]
        "citation_density": min(1.0, len(citations) / max(n_words, 1) * 100),

        # Character composition (upper_ratio is relative to letters only)
        "alpha_ratio": alpha / max(n_chars, 1),
        "digit_ratio": digit / max(n_chars, 1),
        "upper_ratio": upper / max(alpha, 1),

        # Mean words per sentence / 50, clipped (proxy for formality)
        "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50),
    }
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _mean_sentence_length(text: str) -> float:
|
| 119 |
+
"""Average words per sentence (rough split on .!?)."""
|
| 120 |
+
sentences = re.split(r"[.!?]+", text)
|
| 121 |
+
sentences = [s.strip() for s in sentences if s.strip()]
|
| 122 |
+
if not sentences:
|
| 123 |
+
return 0.0
|
| 124 |
+
return sum(len(s.split()) for s in sentences) / len(sentences)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _empty_features() -> dict:
|
| 128 |
+
return {
|
| 129 |
+
"log_chars": 0.0, "log_words": 0.0,
|
| 130 |
+
"n_unique_sections": 0.0,
|
| 131 |
+
"has_abstract": 0.0, "has_methods": 0.0,
|
| 132 |
+
"has_references": 0.0, "has_introduction": 0.0,
|
| 133 |
+
"has_results": 0.0, "has_discussion": 0.0,
|
| 134 |
+
"citation_density": 0.0,
|
| 135 |
+
"alpha_ratio": 0.0, "digit_ratio": 0.0, "upper_ratio": 0.0,
|
| 136 |
+
"mean_sentence_len": 0.0,
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# Stable, ordered feature names; order follows the dict insertion order
# of _empty_features(), which extract_structural_features() mirrors.
STRUCTURAL_FEATURE_NAMES = list(_empty_features().keys())
# Width of the structural-feature vector concatenated to the embedding.
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
|