# File size: 9,745 Bytes
# ba415d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""
PosterSentry — Multimodal Scientific Poster Classifier
=======================================================

Architecture:
    ┌──────────┐    ┌──────────────┐    ┌────────────────┐
    │ PDF text │    │ PDF → image  │    │ PDF structure  │
    └────┬─────┘    └──────┬───────┘    └───────┬────────┘
         │                 │                    │
    model2vec         15 visual            15 structural
    → 512-d emb       features             features
         │                 │                    │
         └────────┬────────┴────────────────────┘
                  │
          concat → 542-d input
                  │
          LogisticRegression
                  │
         poster / non_poster

Single linear classifier on the concatenated feature vector.
Same paradigm as PubGuard — lightweight, CPU-only, fast.
"""

import logging
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np

from .features import (
    VisualFeatureExtractor,
    PDFStructuralExtractor,
    N_VISUAL_FEATURES,
    N_STRUCTURAL_FEATURES,
)

logger = logging.getLogger(__name__)


class PosterSentry:
    """
    Multimodal poster classifier.

    Combines:
        - model2vec text embedding (512-d)
        - 15 visual features (color, edge, FFT, whitespace)
        - 15 structural features (page geometry, fonts, text blocks)

    into a single 542-d feature vector for logistic regression.
    """

    def __init__(
        self,
        model_name: str = "minishlab/potion-base-32M",
        models_dir: Optional[Path] = None,
    ):
        """Create an unloaded classifier; weights are loaded by initialize().

        Args:
            model_name: Name handed to model2vec when no cached copy of the
                embedding model is found under ``models_dir``.
            models_dir: Directory holding the cached embedding model and the
                classifier head. A falsy value selects
                ``_default_models_dir()``.
        """
        chosen_dir = models_dir if models_dir else self._default_models_dir()
        self.models_dir = Path(chosen_dir)
        self.model_name = model_name

        # Class names in the order the head was constructed with.
        self.labels = ["non_poster", "poster"]

        # Filled in by _load_text_model() / _load_head().
        self.text_model = None
        self.W: Optional[np.ndarray] = None
        self.b: Optional[np.ndarray] = None
        self.scaler_mean: Optional[np.ndarray] = None
        self.scaler_scale: Optional[np.ndarray] = None

        # Feature extractors for the visual and structural modalities.
        self.visual_extractor = VisualFeatureExtractor()
        self.structural_extractor = PDFStructuralExtractor()

        self._initialized = False

    @staticmethod
    def _default_models_dir() -> Path:
        import os
        if env := os.environ.get("POSTER_SENTRY_MODELS_DIR"):
            return Path(env)
        home = Path.home() / ".poster_sentry" / "models"
        home.mkdir(parents=True, exist_ok=True)
        return home

    # ── Initialization ──────────────────────────────────────────

    def initialize(self) -> bool:
        if self._initialized:
            return True
        logger.info("Initializing PosterSentry...")
        t0 = time.time()
        self._load_text_model()
        self._load_head()
        self._initialized = True
        logger.info(f"PosterSentry initialized in {time.time()-t0:.1f}s")
        return True

    def _load_text_model(self):
        """Load the model2vec embedding model into ``self.text_model``.

        A copy stored at ``<models_dir>/poster-sentry-embedding`` is used when
        present. Otherwise the model is fetched by ``self.model_name`` and
        saved to that location for next time.
        """
        from model2vec import StaticModel

        cached_copy = self.models_dir / "poster-sentry-embedding"
        if not cached_copy.exists():
            # First run: fetch by name, then persist alongside the head.
            self.text_model = StaticModel.from_pretrained(self.model_name)
            cached_copy.parent.mkdir(parents=True, exist_ok=True)
            self.text_model.save_pretrained(str(cached_copy))
            return

        self.text_model = StaticModel.from_pretrained(str(cached_copy))

    def _load_head(self):
        path = self.models_dir / "poster_sentry_head.npz"
        if path.exists():
            data = np.load(path, allow_pickle=True)
            self.W = data["W"]
            self.b = data["b"]
            if "labels" in data:
                self.labels = list(data["labels"])
            if "scaler_mean" in data and "scaler_scale" in data:
                self.scaler_mean = data["scaler_mean"]
                self.scaler_scale = data["scaler_scale"]
            logger.info(f"  Loaded classifier head: {path}")
        else:
            logger.warning(f"  Head not found: {path} β€” run training first")

    def save_head(self, path: Optional[Path] = None):
        path = path or (self.models_dir / "poster_sentry_head.npz")
        path.parent.mkdir(parents=True, exist_ok=True)
        np.savez(path, W=self.W, b=self.b, labels=np.array(self.labels))

    # ── Feature extraction ──────────────────────────────────────

    def extract_text(self, pdf_path: str, max_chars: int = 4000) -> str:
        """Extract and clean text from first page of PDF."""
        try:
            import fitz
            doc = fitz.open(pdf_path)
            if len(doc) == 0:
                doc.close()
                return ""
            text = doc[0].get_text()
            doc.close()
            # Basic cleanup
            import re
            text = re.sub(r"\s+", " ", text).strip()
            return text[:max_chars]
        except Exception:
            return ""

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode texts with model2vec, L2-normalize."""
        embeddings = self.text_model.encode(texts)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms = np.where(norms == 0, 1, norms)
        return (embeddings / norms).astype("float32")

    def build_feature_vector(
        self,
        text_emb: np.ndarray,
        visual_feats: np.ndarray,
        structural_feats: np.ndarray,
    ) -> np.ndarray:
        """Concatenate all features: [512 text + 15 visual + 15 structural] = 542."""
        return np.concatenate([text_emb, visual_feats, structural_feats])

    # ── Inference ───────────────────────────────────────────────

    def classify(self, pdf_path: str) -> Dict[str, Any]:
        """Classify a single PDF as poster or non-poster."""
        if not self._initialized:
            self.initialize()
        return self.classify_batch([pdf_path])[0]

    def classify_batch(self, pdf_paths: List[str]) -> List[Dict[str, Any]]:
        """Classify a batch of PDFs."""
        if not self._initialized:
            self.initialize()

        texts = []
        visual_vecs = []
        structural_vecs = []

        for p in pdf_paths:
            texts.append(self.extract_text(p))

            img = self.visual_extractor.pdf_to_image(p)
            if img is not None:
                vf = self.visual_extractor.extract(img)
            else:
                vf = {n: 0.0 for n in self.visual_extractor.FEATURE_NAMES}
            visual_vecs.append(self.visual_extractor.to_vector(vf))

            sf = self.structural_extractor.extract(p)
            structural_vecs.append(self.structural_extractor.to_vector(sf))

        # Embed text
        text_embs = self.embed_texts(texts)
        visual_arr = np.array(visual_vecs, dtype="float32")
        struct_arr = np.array(structural_vecs, dtype="float32")

        # Concatenate
        X = np.concatenate([text_embs, visual_arr, struct_arr], axis=1)

        # Scale features (critical for balanced text vs structural signal)
        if self.scaler_mean is not None and self.scaler_scale is not None:
            X = (X - self.scaler_mean) / np.where(self.scaler_scale == 0, 1, self.scaler_scale)

        # Predict
        if self.W is None:
            return [{"path": p, "is_poster": False, "confidence": 0.0,
                     "error": "Model not trained"} for p in pdf_paths]

        logits = X @ self.W + self.b
        e = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs = e / e.sum(axis=-1, keepdims=True)

        results = []
        for i, p in enumerate(pdf_paths):
            poster_prob = float(probs[i, 1])
            results.append({
                "path": str(p),
                "is_poster": poster_prob > 0.5,
                "confidence": round(poster_prob, 4),
                "text_score": round(float(probs[i, 1]), 4),
            })
        return results

    # ── Text-only classification (for PubGuard integration) ─────

    def classify_text(self, text: str) -> Dict[str, Any]:
        """Classify from text alone (no PDF needed). Used by PubGuard."""
        return self.classify_texts([text])[0]

    def classify_texts(self, texts: List[str]) -> List[Dict[str, Any]]:
        """Classify from text alone (batch)."""
        if not self._initialized:
            self.initialize()
        if self.W is None:
            return [{"is_poster": False, "confidence": 0.0}] * len(texts)

        text_embs = self.embed_texts(texts)
        # Zero-fill visual and structural features
        zeros_visual = np.zeros((len(texts), N_VISUAL_FEATURES), dtype="float32")
        zeros_struct = np.zeros((len(texts), N_STRUCTURAL_FEATURES), dtype="float32")
        X = np.concatenate([text_embs, zeros_visual, zeros_struct], axis=1)

        # Scale
        if self.scaler_mean is not None and self.scaler_scale is not None:
            X = (X - self.scaler_mean) / np.where(self.scaler_scale == 0, 1, self.scaler_scale)

        logits = X @ self.W + self.b
        e = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs = e / e.sum(axis=-1, keepdims=True)

        return [{"is_poster": float(probs[i, 1]) > 0.5,
                 "confidence": round(float(probs[i, 1]), 4)}
                for i in range(len(texts))]