fairdataihub
/

poster-sentry

+"""
+Feature extractors for PosterSentry.
+Two feature channels:
+    1. Visual features — image-level statistics (color, edges, FFT, whitespace)
+    2. PDF structural features — page geometry, text blocks, font diversity
+Both are cheap to compute (no GPU needed), providing strong priors that
+complement the text embedding from model2vec.
+"""
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+logger = logging.getLogger(__name__)
+# ── Visual Feature Extractor ────────────────────────────────────
+VISUAL_FEATURE_NAMES = [
+    "img_width",
+    "img_height",
+    "img_aspect_ratio",
+    "mean_r", "mean_g", "mean_b",
+    "std_r", "std_g", "std_b",
+    "local_contrast",
+    "color_diversity",
+    "edge_density",
+    "spatial_complexity",
+    "white_space_ratio",
+    "high_contrast_ratio",
+]
+N_VISUAL_FEATURES = len(VISUAL_FEATURE_NAMES)
+class VisualFeatureExtractor:
+    """Extract visual features from rendered PDF pages."""
+    FEATURE_NAMES = VISUAL_FEATURE_NAMES
+    def __init__(self, target_size: Tuple[int, int] = (256, 256)):
+        self.target_size = target_size
+    def pdf_to_image(self, pdf_path: str, dpi: int = 72) -> Optional[np.ndarray]:
+        """Render first page of PDF to RGB numpy array."""
+        try:
+            import fitz
+            doc = fitz.open(pdf_path)
+            if len(doc) == 0:
+                doc.close()
+                return None
+            page = doc[0]
+            mat = fitz.Matrix(dpi / 72, dpi / 72)
+            pix = page.get_pixmap(matrix=mat)
+            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+            if pix.n == 4:
+                img = img[:, :, :3]
+            elif pix.n == 1:
+                img = np.stack([img[:, :, 0]] * 3, axis=-1)
+            doc.close()
+            return img
+        except Exception as e:
+            logger.debug(f"PDF to image failed: {e}")
+            return None
+    def extract(self, image: np.ndarray) -> Dict[str, float]:
+        """Extract 15 visual features from an RGB image."""
+        feats = {n: 0.0 for n in self.FEATURE_NAMES}
+        try:
+            from PIL import Image as PILImage
+            h, w = image.shape[:2]
+            feats["img_width"] = float(w)
+            feats["img_height"] = float(h)
+            feats["img_aspect_ratio"] = w / h if h > 0 else 0.0
+            pil = PILImage.fromarray(image).resize(self.target_size, PILImage.Resampling.BILINEAR)
+            resized = np.array(pil)
+            for i, ch in enumerate(["r", "g", "b"]):
+                feats[f"mean_{ch}"] = float(np.mean(resized[:, :, i]))
+                feats[f"std_{ch}"] = float(np.std(resized[:, :, i]))
+            gray = np.mean(resized, axis=2)
+            feats["local_contrast"] = float(np.std(gray))
+            # Color diversity (unique quantized colors in 32x32 thumbnail)
+            small = np.array(pil.resize((32, 32)))
+            quantized = (small // 32).astype(np.uint8)
+            unique_colors = len(np.unique(quantized.reshape(-1, 3), axis=0))
+            feats["color_diversity"] = unique_colors / 512.0
+            # Edge density
+            gy = np.abs(np.diff(gray, axis=0))
+            gx = np.abs(np.diff(gray, axis=1))
+            feats["edge_density"] = float(np.mean(gy) + np.mean(gx)) / 255.0
+            # Spatial complexity (high-freq ratio via FFT)
+            fft = np.fft.fft2(gray)
+            fft_shift = np.fft.fftshift(fft)
+            mag = np.abs(fft_shift)
+            ch, cw = mag.shape[0] // 2, mag.shape[1] // 2
+            radius = min(mag.shape) // 4
+            y, x = np.ogrid[:mag.shape[0], :mag.shape[1]]
+            center_mask = ((y - ch) ** 2 + (x - cw) ** 2) <= radius ** 2
+            total_e = np.sum(mag ** 2)
+            low_e = np.sum(mag[center_mask] ** 2)
+            feats["spatial_complexity"] = 1.0 - (low_e / total_e) if total_e > 0 else 0.0
+            # White space ratio
+            white_px = np.sum(np.all(resized > 240, axis=2))
+            feats["white_space_ratio"] = white_px / (self.target_size[0] * self.target_size[1])
+            # High contrast ratio (very dark + very bright pixels)
+            feats["high_contrast_ratio"] = float(np.sum(gray < 50) + np.sum(gray > 240)) / gray.size
+        except Exception as e:
+            logger.debug(f"Visual feature extraction failed: {e}")
+        return feats
+    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
+        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")
+# ── PDF Structural Feature Extractor ────────────────────────────
+STRUCTURAL_FEATURE_NAMES = [
+    "page_count",
+    "page_width_pt",
+    "page_height_pt",
+    "page_aspect_ratio",
+    "page_area_sqin",
+    "is_landscape",
+    "text_block_count",
+    "font_count",
+    "avg_font_size",
+    "font_size_variance",
+    "title_score",
+    "text_density",
+    "line_count",
+    "file_size_kb",
+    "size_per_page_kb",
+]
+N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
+class PDFStructuralExtractor:
+    """Extract structural features from PDF layout."""
+    FEATURE_NAMES = STRUCTURAL_FEATURE_NAMES
+    def extract(self, pdf_path: str) -> Dict[str, float]:
+        """Extract 15 structural features from a PDF."""
+        feats = {n: 0.0 for n in self.FEATURE_NAMES}
+        try:
+            import fitz
+            path = Path(pdf_path)
+            doc = fitz.open(str(path))
+            if len(doc) == 0:
+                doc.close()
+                return feats
+            feats["page_count"] = float(len(doc))
+            feats["file_size_kb"] = path.stat().st_size / 1024.0
+            feats["size_per_page_kb"] = feats["file_size_kb"] / max(len(doc), 1)
+            page = doc[0]
+            rect = page.rect
+            feats["page_width_pt"] = rect.width
+            feats["page_height_pt"] = rect.height
+            feats["page_aspect_ratio"] = rect.width / rect.height if rect.height > 0 else 0.0
+            feats["page_area_sqin"] = (rect.width / 72.0) * (rect.height / 72.0)
+            feats["is_landscape"] = float(rect.width > rect.height)
+            # Text blocks
+            blocks = page.get_text("dict")["blocks"]
+            text_blocks = [b for b in blocks if b.get("type") == 0]
+            feats["text_block_count"] = float(len(text_blocks))
+            if text_blocks:
+                heights = [b["bbox"][3] - b["bbox"][1] for b in text_blocks]
+                widths = [b["bbox"][2] - b["bbox"][0] for b in text_blocks]
+                total_area = sum(h * w for h, w in zip(heights, widths))
+                page_area = rect.width * rect.height
+                feats["text_density"] = total_area / page_area if page_area > 0 else 0.0
+            # Font statistics
+            fonts = set()
+            font_sizes = []
+            line_count = 0
+            for block in text_blocks:
+                for line in block.get("lines", []):
+                    line_count += 1
+                    for span in line.get("spans", []):
+                        fonts.add(span.get("font", ""))
+                        sz = span.get("size", 0)
+                        if sz > 0:
+                            font_sizes.append(sz)
+            feats["font_count"] = float(len(fonts))
+            feats["line_count"] = float(line_count)
+            if font_sizes:
+                feats["avg_font_size"] = float(np.mean(font_sizes))
+                feats["font_size_variance"] = float(np.var(font_sizes)) if len(font_sizes) > 1 else 0.0
+                feats["title_score"] = max(font_sizes) / (np.mean(font_sizes) + 1.0)
+            doc.close()
+        except Exception as e:
+            logger.debug(f"PDF structural extraction failed: {e}")
+        return feats
+    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
+        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")