File size: 8,050 Bytes
b5b458d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""
Feature extractors for PosterSentry.

Two feature channels:
    1. Visual features — image-level statistics (color, edges, FFT, whitespace)
    2. PDF structural features β€” page geometry, text blocks, font diversity

Both are cheap to compute (no GPU needed), providing strong priors that
complement the text embedding from model2vec.
"""

import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

# ── Visual Feature Extractor ────────────────────────────────────

# Ordered names of the visual features. The position of a name in this list
# is its index in the vector produced by VisualFeatureExtractor.to_vector.
VISUAL_FEATURE_NAMES = [
    "img_width",
    "img_height",
    "img_aspect_ratio",
    "mean_r", "mean_g", "mean_b",
    "std_r", "std_g", "std_b",
    "local_contrast",
    "color_diversity",
    "edge_density",
    "spatial_complexity",
    "white_space_ratio",
    "high_contrast_ratio",
]

N_VISUAL_FEATURES = len(VISUAL_FEATURE_NAMES)


class VisualFeatureExtractor:
    """Extract visual features from rendered PDF pages.

    Width, height and aspect ratio come from the input image as given; every
    other statistic is computed on a copy resized to ``target_size`` so the
    values are comparable across pages of different resolution.
    """

    FEATURE_NAMES = VISUAL_FEATURE_NAMES

    def __init__(self, target_size: Tuple[int, int] = (256, 256)):
        """Store the size the image is resized to before computing statistics.

        Args:
            target_size: Size passed to PIL's ``Image.resize``.
        """
        self.target_size = target_size

    def pdf_to_image(self, pdf_path: str, dpi: int = 72) -> Optional[np.ndarray]:
        """Render the first page of a PDF to an RGB numpy array.

        Args:
            pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
            dpi: Render resolution; the page is scaled by ``dpi / 72``.

        Returns:
            A ``uint8`` array of shape ``(height, width, 3)``, or ``None`` when
            the document has no pages or rendering fails for any reason
            (the failure is logged at debug level, never raised).
        """
        try:
            import fitz

            # The context manager closes the document on every exit path,
            # including exceptions raised while rendering the page.
            with fitz.open(pdf_path) as doc:
                if len(doc) == 0:
                    return None
                zoom = dpi / 72
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, pix.n
                )
                if pix.n == 4:
                    # Keep only the first three channels.
                    img = img[:, :, :3]
                elif pix.n == 1:
                    # Replicate the single channel into three.
                    img = np.stack([img[:, :, 0]] * 3, axis=-1)
                return img
        except Exception as e:
            logger.debug(f"PDF to image failed: {e}")
            return None

    def extract(self, image: np.ndarray) -> Dict[str, float]:
        """Extract 15 visual features from an RGB image.

        Args:
            image: Image array whose first two axes are (height, width).
                Grayscale and RGBA arrays are converted to RGB before the
                statistics are computed.

        Returns:
            A dict with one float per name in ``FEATURE_NAMES``. Extraction is
            best-effort: if a step fails, the error is logged at debug level
            and the features not yet computed keep their default of ``0.0``.
        """
        feats = {n: 0.0 for n in self.FEATURE_NAMES}
        try:
            from PIL import Image as PILImage

            h, w = image.shape[:2]
            feats["img_width"] = float(w)
            feats["img_height"] = float(h)
            feats["img_aspect_ratio"] = float(w / h) if h > 0 else 0.0

            # Normalise to three channels first: the per-channel statistics
            # and the colour quantisation below both assume exactly R, G, B.
            pil = (
                PILImage.fromarray(image)
                .convert("RGB")
                .resize(self.target_size, PILImage.Resampling.BILINEAR)
            )
            resized = np.array(pil)

            for idx, channel in enumerate(("r", "g", "b")):
                plane = resized[:, :, idx]
                feats[f"mean_{channel}"] = float(np.mean(plane))
                feats[f"std_{channel}"] = float(np.std(plane))

            gray = np.mean(resized, axis=2)
            feats["local_contrast"] = float(np.std(gray))

            # Color diversity: unique quantized colors in a 32x32 thumbnail.
            # `// 32` leaves 8 levels per channel, i.e. 8**3 = 512 possible bins.
            small = np.array(pil.resize((32, 32)))
            quantized = (small // 32).astype(np.uint8)
            unique_colors = len(np.unique(quantized.reshape(-1, 3), axis=0))
            feats["color_diversity"] = unique_colors / 512.0

            # Edge density: mean absolute difference between neighbouring
            # pixels along each axis, scaled by the 8-bit range.
            gy = np.abs(np.diff(gray, axis=0))
            gx = np.abs(np.diff(gray, axis=1))
            feats["edge_density"] = float(np.mean(gy) + np.mean(gx)) / 255.0

            # Spatial complexity: share of spectral energy outside a centred
            # disc whose radius is a quarter of the shorter side.
            mag = np.abs(np.fft.fftshift(np.fft.fft2(gray)))
            center_row, center_col = mag.shape[0] // 2, mag.shape[1] // 2
            radius = min(mag.shape) // 4
            rows, cols = np.ogrid[:mag.shape[0], :mag.shape[1]]
            center_mask = ((rows - center_row) ** 2 + (cols - center_col) ** 2) <= radius ** 2
            total_e = np.sum(mag ** 2)
            low_e = np.sum(mag[center_mask] ** 2)
            feats["spatial_complexity"] = float(1.0 - (low_e / total_e)) if total_e > 0 else 0.0

            # White space ratio: pixels whose R, G and B all exceed 240.
            white_px = np.sum(np.all(resized > 240, axis=2))
            feats["white_space_ratio"] = float(
                white_px / (self.target_size[0] * self.target_size[1])
            )

            # High contrast ratio (very dark + very bright pixels)
            feats["high_contrast_ratio"] = float(np.sum(gray < 50) + np.sum(gray > 240)) / gray.size

        except Exception as e:
            logger.debug(f"Visual feature extraction failed: {e}")
        return feats

    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
        """Return the features as a float32 vector ordered like ``FEATURE_NAMES``.

        Names missing from ``feats`` become ``0.0``; keys not listed in
        ``FEATURE_NAMES`` are ignored.
        """
        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")


# ── PDF Structural Feature Extractor ────────────────────────────

# Ordered names of the structural features. The position of a name in this
# list is its index in the vector produced by PDFStructuralExtractor.to_vector.
STRUCTURAL_FEATURE_NAMES = [
    "page_count",
    "page_width_pt",
    "page_height_pt",
    "page_aspect_ratio",
    "page_area_sqin",
    "is_landscape",
    "text_block_count",
    "font_count",
    "avg_font_size",
    "font_size_variance",
    "title_score",
    "text_density",
    "line_count",
    "file_size_kb",
    "size_per_page_kb",
]

N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)


class PDFStructuralExtractor:
    """Extract structural features from PDF layout.

    Document-level values (page count, file size) describe the whole file;
    geometry, text-block and font statistics are taken from the first page.
    """

    FEATURE_NAMES = STRUCTURAL_FEATURE_NAMES

    def extract(self, pdf_path: str) -> Dict[str, float]:
        """Extract 15 structural features from a PDF.

        Args:
            pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).

        Returns:
            A dict with one float per name in ``FEATURE_NAMES``. Extraction is
            best-effort: an empty document yields all zeros, and if a step
            fails the error is logged at debug level and the features not yet
            computed keep their default of ``0.0``.
        """
        feats = {n: 0.0 for n in self.FEATURE_NAMES}
        try:
            import fitz

            path = Path(pdf_path)
            # The context manager closes the document on every exit path,
            # including exceptions raised part-way through extraction.
            with fitz.open(str(path)) as doc:
                n_pages = len(doc)
                if n_pages == 0:
                    return feats

                feats["page_count"] = float(n_pages)
                feats["file_size_kb"] = path.stat().st_size / 1024.0
                feats["size_per_page_kb"] = feats["file_size_kb"] / n_pages

                page = doc[0]
                rect = page.rect
                page_area = rect.width * rect.height
                feats["page_width_pt"] = rect.width
                feats["page_height_pt"] = rect.height
                feats["page_aspect_ratio"] = rect.width / rect.height if rect.height > 0 else 0.0
                # Page dimensions are divided by 72 to express the area in square inches.
                feats["page_area_sqin"] = (rect.width / 72.0) * (rect.height / 72.0)
                feats["is_landscape"] = float(rect.width > rect.height)

                # Text blocks: entries of the page dict whose type is 0.
                blocks = page.get_text("dict")["blocks"]
                text_blocks = [b for b in blocks if b.get("type") == 0]
                feats["text_block_count"] = float(len(text_blocks))

                if text_blocks:
                    # Sum of bounding-box areas over the page area; bbox is
                    # indexed as (x0, y0, x1, y1).
                    total_area = sum(
                        (b["bbox"][3] - b["bbox"][1]) * (b["bbox"][2] - b["bbox"][0])
                        for b in text_blocks
                    )
                    feats["text_density"] = total_area / page_area if page_area > 0 else 0.0

                # Font statistics gathered over every span of every line.
                fonts = set()
                font_sizes = []
                line_count = 0
                for block in text_blocks:
                    for line in block.get("lines", []):
                        line_count += 1
                        for span in line.get("spans", []):
                            fonts.add(span.get("font", ""))
                            sz = span.get("size", 0)
                            if sz > 0:
                                font_sizes.append(sz)

                feats["font_count"] = float(len(fonts))
                feats["line_count"] = float(line_count)
                if font_sizes:
                    mean_size = float(np.mean(font_sizes))
                    feats["avg_font_size"] = mean_size
                    feats["font_size_variance"] = (
                        float(np.var(font_sizes)) if len(font_sizes) > 1 else 0.0
                    )
                    # Largest size relative to the mean; the +1.0 keeps the
                    # denominator away from zero.
                    feats["title_score"] = float(max(font_sizes) / (mean_size + 1.0))
        except Exception as e:
            logger.debug(f"PDF structural extraction failed: {e}")
        return feats

    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
        """Return the features as a float32 vector ordered like ``FEATURE_NAMES``.

        Names missing from ``feats`` become ``0.0``; keys not listed in
        ``FEATURE_NAMES`` are ignored.
        """
        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")