jimnoneill committed on
Commit
23c2fec
Β·
verified Β·
1 Parent(s): e1fe580

Upload src/pubguard/text.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/pubguard/text.py +141 -0
src/pubguard/text.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text preprocessing for PubGuard.
3
+
4
+ Designed for text *already extracted from PDFs* (e.g. via pdfplumber,
5
+ PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning
6
+ OCR / layout artefacts and producing a compact representation that
7
+ captures enough signal for the three classification heads.
8
+ """
9
+
10
+ import re
11
+ from typing import Optional
12
+
13
# ── Compiled patterns ────────────────────────────────────────────

_WHITESPACE = re.compile(r"\s+")
_HEADER_JUNK = re.compile(
    r"(doi:\s*\S+|https?://\S+|©\s*\d{4}|all rights reserved)",
    re.IGNORECASE,
)
_PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n")

# PDF extractors frequently emit the Unicode "alphabetic presentation form"
# ligature codepoints (U+FB00–U+FB04). The character class must contain those
# codepoints — not the ASCII letter sequences "fi"/"fl"/… — or the
# substitution below never fires.
_LIGATURE = re.compile(r"[\ufb00\ufb01\ufb02\ufb03\ufb04]")
_LIGATURE_ASCII = {
    "\ufb00": "ff",   # ﬀ
    "\ufb01": "fi",   # ﬁ
    "\ufb02": "fl",   # ﬂ
    "\ufb03": "ffi",  # ﬃ
    "\ufb04": "ffl",  # ﬄ
}

# Structural markers we look for to characterise document type
SECTION_HEADINGS = re.compile(
    r"\b(abstract|introduction|methods?|methodology|results|discussion|"
    r"conclusions?|references|bibliography|acknowledgments?|funding|"
    r"supplementary|materials?\s+and\s+methods?|related\s+work|"
    r"background|literature\s+review|experimental|data\s+availability)\b",
    re.IGNORECASE,
)

CITATION_PATTERN = re.compile(
    r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)",
)


def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
    """
    Normalise raw PDF-extracted text for embedding.

    Steps:
        1. Replace Unicode ligatures with ASCII equivalents.
        2. Strip DOIs, URLs, copyright lines.
        3. Remove isolated page numbers.
        4. Collapse whitespace.
        5. Truncate to `max_chars`.

    Parameters
    ----------
    text : Optional[str]
        Raw extracted text. Falsy values yield ""; non-str values
        are coerced with ``str()``.
    max_chars : int
        Hard cap on the length of the returned string.

    Returns
    -------
    str
        Cleaned, single-spaced text, at most ``max_chars`` characters.
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # Ligatures → ASCII. Every codepoint matched by _LIGATURE has an
    # entry in _LIGATURE_ASCII, so direct indexing cannot KeyError.
    text = _LIGATURE.sub(lambda m: _LIGATURE_ASCII[m.group()], text)

    text = _HEADER_JUNK.sub(" ", text)
    text = _PAGE_NUMBER.sub("\n", text)
    text = _WHITESPACE.sub(" ", text).strip()

    return text[:max_chars]
64
+
65
+
66
def extract_structural_features(text: str) -> dict:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector. Empty input
    yields the all-zero feature dict.
    """
    if not text:
        return _empty_features()

    total_chars = len(text)
    total_words = len(text.split())

    # Canonical section headings present in the text (case-folded, deduped).
    seen_sections = {match.lower() for match in SECTION_HEADINGS.findall(text)}

    # Inline citation matches, e.g. "[12]" or "(Smith et al., 2020)".
    n_citations = len(CITATION_PATTERN.findall(text))

    # Character composition counts in a single pass. Note the checks are
    # independent (an uppercase letter counts toward both alpha and upper).
    n_alpha = n_digit = n_upper = 0
    for ch in text:
        if ch.isalpha():
            n_alpha += 1
        if ch.isdigit():
            n_digit += 1
        if ch.isupper():
            n_upper += 1

    features = {}

    # Document length signals (clipped linear scaling, despite the names).
    features["log_chars"] = min(1.0, total_chars / 4000)
    features["log_words"] = min(1.0, total_words / 800)

    # Structure signals.
    features["n_unique_sections"] = min(1.0, len(seen_sections) / 8)
    features["has_abstract"] = float("abstract" in seen_sections)
    features["has_methods"] = float(bool(seen_sections & {"methods", "methodology", "materials and methods"}))
    features["has_references"] = float(bool(seen_sections & {"references", "bibliography"}))
    features["has_introduction"] = float("introduction" in seen_sections)
    features["has_results"] = float("results" in seen_sections)
    features["has_discussion"] = float("discussion" in seen_sections)

    # Citations per 100 words, clipped to 1.
    features["citation_density"] = min(1.0, n_citations / max(total_words, 1) * 100)

    # Character composition ratios (max() guards divide-by-zero).
    features["alpha_ratio"] = n_alpha / max(total_chars, 1)
    features["digit_ratio"] = n_digit / max(total_chars, 1)
    features["upper_ratio"] = n_upper / max(n_alpha, 1)

    # Mean words per sentence as a formality proxy, clipped at 50.
    features["mean_sentence_len"] = min(1.0, _mean_sentence_length(text) / 50)

    return features
116
+
117
+
118
+ def _mean_sentence_length(text: str) -> float:
119
+ """Average words per sentence (rough split on .!?)."""
120
+ sentences = re.split(r"[.!?]+", text)
121
+ sentences = [s.strip() for s in sentences if s.strip()]
122
+ if not sentences:
123
+ return 0.0
124
+ return sum(len(s.split()) for s in sentences) / len(sentences)
125
+
126
+
127
+ def _empty_features() -> dict:
128
+ return {
129
+ "log_chars": 0.0, "log_words": 0.0,
130
+ "n_unique_sections": 0.0,
131
+ "has_abstract": 0.0, "has_methods": 0.0,
132
+ "has_references": 0.0, "has_introduction": 0.0,
133
+ "has_results": 0.0, "has_discussion": 0.0,
134
+ "citation_density": 0.0,
135
+ "alpha_ratio": 0.0, "digit_ratio": 0.0, "upper_ratio": 0.0,
136
+ "mean_sentence_len": 0.0,
137
+ }
138
+
139
+
140
# Canonical feature ordering and count, derived once at import time
# from the zero-feature template.
STRUCTURAL_FEATURE_NAMES = [*_empty_features()]
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)