jimnoneill committed on
Commit
b5b458d
·
verified ·
1 Parent(s): ff83f62

Upload src/poster_sentry/features.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/poster_sentry/features.py +217 -0
src/poster_sentry/features.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature extractors for PosterSentry.
3
+
4
+ Two feature channels:
5
+ 1. Visual features — image-level statistics (color, edges, FFT, whitespace)
6
+ 2. PDF structural features β€” page geometry, text blocks, font diversity
7
+
8
+ Both are cheap to compute (no GPU needed), providing strong priors that
9
+ complement the text embedding from model2vec.
10
+ """
11
+
12
+ import logging
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Tuple
15
+
16
+ import numpy as np
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # ── Visual Feature Extractor ────────────────────────────────────
21
+
22
# Ordered names of the visual features. The order is significant: it fixes the
# layout of the vector built by VisualFeatureExtractor.to_vector.
VISUAL_FEATURE_NAMES = [
    # Geometry of the rendered page image.
    "img_width",
    "img_height",
    "img_aspect_ratio",
    # Per-channel colour statistics (all means first, then all deviations).
    *(f"mean_{channel}" for channel in "rgb"),
    *(f"std_{channel}" for channel in "rgb"),
    # Texture / layout statistics.
    "local_contrast",
    "color_diversity",
    "edge_density",
    "spatial_complexity",
    "white_space_ratio",
    "high_contrast_ratio",
]

# Number of visual features (length of VISUAL_FEATURE_NAMES).
N_VISUAL_FEATURES = len(VISUAL_FEATURE_NAMES)
37
+
38
+
39
class VisualFeatureExtractor:
    """Extract image-level statistics from rendered PDF pages.

    ``FEATURE_NAMES`` fixes both the keys returned by :meth:`extract` and the
    order of the vector produced by :meth:`to_vector`.

    Args:
        target_size: ``(width, height)`` passed to Pillow's ``resize`` before
            the pixel statistics are computed.
    """

    FEATURE_NAMES = VISUAL_FEATURE_NAMES

    def __init__(self, target_size: Tuple[int, int] = (256, 256)):
        self.target_size = target_size

    def pdf_to_image(self, pdf_path: str, dpi: int = 72) -> Optional[np.ndarray]:
        """Render the first page of a PDF to a numpy array.

        Args:
            pdf_path: Path of the PDF file to open.
            dpi: Rendering resolution; the zoom factor applied is ``dpi / 72``.

        Returns:
            A ``uint8`` array of shape ``(height, width, channels)``; 1- and
            4-channel pixmaps are converted to three channels. ``None`` is
            returned when the document has no pages or when anything fails
            (the failure is logged at DEBUG level).
        """
        try:
            import fitz  # imported lazily, as in the rest of this module

            doc = fitz.open(pdf_path)
            try:
                if len(doc) == 0:
                    return None
                zoom = dpi / 72
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, pix.n
                )
                if pix.n == 4:
                    # Keep only the first three channels.
                    img = img[:, :, :3]
                elif pix.n == 1:
                    # Replicate the single channel three times.
                    img = np.stack([img[:, :, 0]] * 3, axis=-1)
                return img
            finally:
                # Always release the document, including when rendering raises.
                doc.close()
        except Exception as e:
            logger.debug("PDF to image failed: %s", e)
            return None

    @staticmethod
    def _as_rgb(image: np.ndarray) -> np.ndarray:
        """Return *image* with three channels, mirroring ``pdf_to_image``."""
        if image.ndim == 2:
            return np.stack([image] * 3, axis=-1)
        if image.ndim == 3 and image.shape[2] == 1:
            return np.stack([image[:, :, 0]] * 3, axis=-1)
        if image.ndim == 3 and image.shape[2] == 4:
            return image[:, :, :3]
        return image

    def extract(self, image: np.ndarray) -> Dict[str, float]:
        """Extract 15 visual features from an image.

        Args:
            image: Pixel array, normally ``(height, width, 3)`` as returned by
                :meth:`pdf_to_image`. 2-D, 1-channel and 4-channel arrays are
                converted to three channels first.

        Returns:
            A dict with one plain ``float`` per name in ``FEATURE_NAMES``.
            Extraction is best-effort: if a step fails (including Pillow not
            being importable) the failure is logged at DEBUG level and every
            feature not yet computed keeps its default of ``0.0``.
        """
        feats = {n: 0.0 for n in self.FEATURE_NAMES}
        try:
            from PIL import Image as PILImage

            image = self._as_rgb(image)

            h, w = image.shape[:2]
            feats["img_width"] = float(w)
            feats["img_height"] = float(h)
            feats["img_aspect_ratio"] = float(w / h) if h > 0 else 0.0

            pil = PILImage.fromarray(image).resize(
                self.target_size, PILImage.Resampling.BILINEAR
            )
            resized = np.array(pil)

            for i, name in enumerate(("r", "g", "b")):
                channel = resized[:, :, i]
                feats[f"mean_{name}"] = float(np.mean(channel))
                feats[f"std_{name}"] = float(np.std(channel))

            gray = np.mean(resized, axis=2)
            feats["local_contrast"] = float(np.std(gray))

            # Color diversity: unique quantized colors in a 32x32 thumbnail.
            # Integer division by 32 leaves 8 levels per channel, hence the
            # 8 ** 3 = 512 normaliser.
            small = np.array(pil.resize((32, 32)))
            quantized = (small // 32).astype(np.uint8)
            unique_colors = len(np.unique(quantized.reshape(-1, 3), axis=0))
            feats["color_diversity"] = unique_colors / 512.0

            # Edge density: mean absolute first difference along both axes.
            gy = np.abs(np.diff(gray, axis=0))
            gx = np.abs(np.diff(gray, axis=1))
            feats["edge_density"] = float(np.mean(gy) + np.mean(gx)) / 255.0

            # Spatial complexity: share of spectral energy outside a centred
            # disc of the shifted 2-D FFT.
            mag = np.abs(np.fft.fftshift(np.fft.fft2(gray)))
            cy, cx = mag.shape[0] // 2, mag.shape[1] // 2
            radius = min(mag.shape) // 4
            y, x = np.ogrid[:mag.shape[0], :mag.shape[1]]
            center_mask = ((y - cy) ** 2 + (x - cx) ** 2) <= radius ** 2
            total_e = np.sum(mag ** 2)
            low_e = np.sum(mag[center_mask] ** 2)
            feats["spatial_complexity"] = (
                float(1.0 - (low_e / total_e)) if total_e > 0 else 0.0
            )

            # White space ratio: pixels whose three channels all exceed 240.
            white_px = np.sum(np.all(resized > 240, axis=2))
            feats["white_space_ratio"] = float(
                white_px / (self.target_size[0] * self.target_size[1])
            )

            # High contrast ratio (very dark + very bright pixels).
            feats["high_contrast_ratio"] = (
                float(np.sum(gray < 50) + np.sum(gray > 240)) / gray.size
            )

        except Exception as e:
            logger.debug("Visual feature extraction failed: %s", e)
        return feats

    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
        """Return the features as a float32 vector ordered by ``FEATURE_NAMES``.

        Names missing from *feats* are filled with ``0.0``.
        """
        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")
126
+
127
+
128
+ # ── PDF Structural Feature Extractor ────────────────────────────
129
+
130
# Ordered names of the PDF structural features. The order is significant: it
# fixes the layout of the vector built by PDFStructuralExtractor.to_vector.
STRUCTURAL_FEATURE_NAMES = [
    # Document and first-page geometry.
    "page_count", "page_width_pt", "page_height_pt",
    "page_aspect_ratio", "page_area_sqin", "is_landscape",
    # Text layout and typography of the first page.
    "text_block_count", "font_count", "avg_font_size",
    "font_size_variance", "title_score", "text_density", "line_count",
    # File size.
    "file_size_kb", "size_per_page_kb",
]

# Number of structural features (length of STRUCTURAL_FEATURE_NAMES).
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
149
+
150
+
151
class PDFStructuralExtractor:
    """Extract structural features from PDF layout.

    ``FEATURE_NAMES`` fixes both the keys returned by :meth:`extract` and the
    order of the vector produced by :meth:`to_vector`.
    """

    FEATURE_NAMES = STRUCTURAL_FEATURE_NAMES

    def extract(self, pdf_path: str) -> Dict[str, float]:
        """Extract 15 structural features from a PDF.

        Page geometry and text statistics are taken from the first page only;
        ``page_count`` and the file-size features describe the whole file.

        Args:
            pdf_path: Path of the PDF file to inspect.

        Returns:
            A dict with one plain ``float`` per name in ``FEATURE_NAMES``.
            Extraction is best-effort: a document with no pages yields all
            zeros, and if any step fails the failure is logged at DEBUG level
            and every feature not yet computed keeps its default of ``0.0``.
        """
        feats = {n: 0.0 for n in self.FEATURE_NAMES}
        try:
            import fitz  # imported lazily, as in the rest of this module

            path = Path(pdf_path)
            doc = fitz.open(str(path))
            try:
                page_count = len(doc)
                if page_count == 0:
                    return feats

                feats["page_count"] = float(page_count)
                feats["file_size_kb"] = path.stat().st_size / 1024.0
                feats["size_per_page_kb"] = feats["file_size_kb"] / page_count

                page = doc[0]
                rect = page.rect
                feats["page_width_pt"] = float(rect.width)
                feats["page_height_pt"] = float(rect.height)
                feats["page_aspect_ratio"] = (
                    float(rect.width / rect.height) if rect.height > 0 else 0.0
                )
                # Dimensions are divided by 72 before multiplying, matching
                # the "_pt" -> "_sqin" feature names.
                feats["page_area_sqin"] = float(
                    (rect.width / 72.0) * (rect.height / 72.0)
                )
                feats["is_landscape"] = float(rect.width > rect.height)

                # Text blocks: only entries with type == 0 are counted.
                blocks = page.get_text("dict")["blocks"]
                text_blocks = [b for b in blocks if b.get("type") == 0]
                feats["text_block_count"] = float(len(text_blocks))

                if text_blocks:
                    # Sum of bounding-box areas over the page area; boxes are
                    # indexed as (x0, y0, x1, y1).
                    total_area = sum(
                        (b["bbox"][3] - b["bbox"][1]) * (b["bbox"][2] - b["bbox"][0])
                        for b in text_blocks
                    )
                    page_area = rect.width * rect.height
                    feats["text_density"] = (
                        float(total_area / page_area) if page_area > 0 else 0.0
                    )

                # Font statistics
                fonts = set()
                font_sizes = []
                line_count = 0
                for block in text_blocks:
                    for line in block.get("lines", []):
                        line_count += 1
                        for span in line.get("spans", []):
                            fonts.add(span.get("font", ""))
                            sz = span.get("size", 0)
                            if sz > 0:
                                font_sizes.append(sz)

                feats["font_count"] = float(len(fonts))
                feats["line_count"] = float(line_count)
                if font_sizes:
                    mean_size = float(np.mean(font_sizes))
                    feats["avg_font_size"] = mean_size
                    feats["font_size_variance"] = (
                        float(np.var(font_sizes)) if len(font_sizes) > 1 else 0.0
                    )
                    # Largest span size relative to the mean; the +1.0 is kept
                    # from the original formula.
                    feats["title_score"] = float(max(font_sizes) / (mean_size + 1.0))
            finally:
                # Always release the document, including when a step raises.
                doc.close()
        except Exception as e:
            logger.debug("PDF structural extraction failed: %s", e)
        return feats

    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
        """Return the features as a float32 vector ordered by ``FEATURE_NAMES``.

        Names missing from *feats* are filled with ``0.0``.
        """
        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")