"""
Feature extractors for PosterSentry.

Two feature channels:

1. Visual features — image-level statistics (color, edges, FFT, whitespace)
2. PDF structural features — page geometry, text blocks, font diversity

Both are cheap to compute (no GPU needed), providing strong priors that
complement the text embedding from model2vec.
"""

import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)


# ── Visual Feature Extractor ────────────────────────────────────

VISUAL_FEATURE_NAMES = [
    "img_width",
    "img_height",
    "img_aspect_ratio",
    "mean_r",
    "mean_g",
    "mean_b",
    "std_r",
    "std_g",
    "std_b",
    "local_contrast",
    "color_diversity",
    "edge_density",
    "spatial_complexity",
    "white_space_ratio",
    "high_contrast_ratio",
]

N_VISUAL_FEATURES = len(VISUAL_FEATURE_NAMES)


class VisualFeatureExtractor:
    """Extract visual features from rendered PDF pages.

    ``fitz`` (PyMuPDF) and Pillow are imported lazily inside the methods that
    need them, so importing this module only requires numpy.
    """

    FEATURE_NAMES = VISUAL_FEATURE_NAMES

    def __init__(self, target_size: Tuple[int, int] = (256, 256)):
        """Store the size every image is resized to before statistics are taken.

        Args:
            target_size: Size passed to ``PIL.Image.resize`` in ``extract``.
        """
        self.target_size = target_size

    def pdf_to_image(self, pdf_path: str, dpi: int = 72) -> Optional[np.ndarray]:
        """Render the first page of a PDF to a 3-channel numpy array.

        Args:
            pdf_path: Path of the PDF to open.
            dpi: Render resolution; the page is scaled by ``dpi / 72``.

        Returns:
            A ``uint8`` array of shape ``(height, width, channels)``, or
            ``None`` when the document has no pages or anything fails
            (including ``fitz`` not being importable). Failures are logged at
            DEBUG level and never raised.
        """
        try:
            import fitz

            doc = fitz.open(pdf_path)
            try:
                if len(doc) == 0:
                    return None

                page = doc[0]
                mat = fitz.Matrix(dpi / 72, dpi / 72)
                pix = page.get_pixmap(matrix=mat)
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, pix.n
                )
                if pix.n == 4:
                    # Keep only the first three channels.
                    img = img[:, :, :3]
                elif pix.n == 1:
                    # Replicate the single channel three times.
                    img = np.stack([img[:, :, 0]] * 3, axis=-1)
                return img
            finally:
                # Close the document on every path, including rendering errors.
                doc.close()
        except Exception as e:
            logger.debug("PDF to image failed: %s", e)
            return None

    def extract(self, image: np.ndarray) -> Dict[str, float]:
        """Extract the 15 visual features named in ``FEATURE_NAMES``.

        The image is resized to ``self.target_size`` and all statistics except
        the width/height/aspect-ratio features are computed on the resized copy.

        Args:
            image: Image array; the channel statistics index the last axis at
                positions 0, 1 and 2.

        Returns:
            A dict with one entry per feature name. Extraction is best-effort:
            if any step raises (including Pillow not being importable), the
            error is logged at DEBUG level and the features not yet computed
            keep their initial value of ``0.0``.
        """
        feats = {n: 0.0 for n in self.FEATURE_NAMES}
        try:
            from PIL import Image as PILImage

            h, w = image.shape[:2]
            feats["img_width"] = float(w)
            feats["img_height"] = float(h)
            feats["img_aspect_ratio"] = w / h if h > 0 else 0.0

            pil = PILImage.fromarray(image).resize(
                self.target_size, PILImage.Resampling.BILINEAR
            )
            resized = np.array(pil)

            for i, channel in enumerate(["r", "g", "b"]):
                feats[f"mean_{channel}"] = float(np.mean(resized[:, :, i]))
                feats[f"std_{channel}"] = float(np.std(resized[:, :, i]))

            # Despite the name, this is the standard deviation of the whole
            # channel-averaged image.
            gray = np.mean(resized, axis=2)
            feats["local_contrast"] = float(np.std(gray))

            # Color diversity (unique quantized colors in 32x32 thumbnail).
            # 8 levels per channel gives 8**3 = 512 possible colors.
            small = np.array(pil.resize((32, 32)))
            quantized = (small // 32).astype(np.uint8)
            unique_colors = len(np.unique(quantized.reshape(-1, 3), axis=0))
            feats["color_diversity"] = unique_colors / 512.0

            # Edge density: mean absolute neighbor difference along both axes.
            gy = np.abs(np.diff(gray, axis=0))
            gx = np.abs(np.diff(gray, axis=1))
            feats["edge_density"] = float(np.mean(gy) + np.mean(gx)) / 255.0

            # Spatial complexity (high-freq ratio via FFT): share of spectral
            # energy outside a centered disc of radius min(shape) // 4.
            mag = np.abs(np.fft.fftshift(np.fft.fft2(gray)))
            center_y, center_x = mag.shape[0] // 2, mag.shape[1] // 2
            radius = min(mag.shape) // 4
            y, x = np.ogrid[: mag.shape[0], : mag.shape[1]]
            center_mask = ((y - center_y) ** 2 + (x - center_x) ** 2) <= radius ** 2
            total_e = np.sum(mag ** 2)
            low_e = np.sum(mag[center_mask] ** 2)
            feats["spatial_complexity"] = (
                float(1.0 - (low_e / total_e)) if total_e > 0 else 0.0
            )

            # White space ratio: pixels whose every channel exceeds 240.
            white_px = np.sum(np.all(resized > 240, axis=2))
            feats["white_space_ratio"] = float(
                white_px / (self.target_size[0] * self.target_size[1])
            )

            # High contrast ratio (very dark + very bright pixels)
            feats["high_contrast_ratio"] = (
                float(np.sum(gray < 50) + np.sum(gray > 240)) / gray.size
            )
        except Exception as e:
            logger.debug("Visual feature extraction failed: %s", e)
        return feats

    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
        """Return the features as a float32 vector ordered like ``FEATURE_NAMES``.

        Names missing from ``feats`` are filled with ``0.0``.
        """
        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")


# ── PDF Structural Feature Extractor ────────────────────────────

STRUCTURAL_FEATURE_NAMES = [
    "page_count",
    "page_width_pt",
    "page_height_pt",
    "page_aspect_ratio",
    "page_area_sqin",
    "is_landscape",
    "text_block_count",
    "font_count",
    "avg_font_size",
    "font_size_variance",
    "title_score",
    "text_density",
    "line_count",
    "file_size_kb",
    "size_per_page_kb",
]

N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)


class PDFStructuralExtractor:
    """Extract structural features from PDF layout."""

    FEATURE_NAMES = STRUCTURAL_FEATURE_NAMES

    def extract(self, pdf_path: str) -> Dict[str, float]:
        """Extract the 15 structural features named in ``FEATURE_NAMES``.

        ``page_count`` and the two file-size features describe the whole
        document; every other feature is measured on the first page only.

        Args:
            pdf_path: Path of the PDF to inspect.

        Returns:
            A dict with one entry per feature name. Extraction is best-effort:
            an empty document yields all zeros, and if any step raises
            (including ``fitz`` not being importable) the error is logged at
            DEBUG level and the features not yet computed stay at ``0.0``.
        """
        feats = {n: 0.0 for n in self.FEATURE_NAMES}
        try:
            import fitz

            path = Path(pdf_path)
            doc = fitz.open(str(path))
            try:
                if len(doc) == 0:
                    return feats

                feats["page_count"] = float(len(doc))
                feats["file_size_kb"] = path.stat().st_size / 1024.0
                feats["size_per_page_kb"] = feats["file_size_kb"] / max(len(doc), 1)

                page = doc[0]
                rect = page.rect
                feats["page_width_pt"] = rect.width
                feats["page_height_pt"] = rect.height
                feats["page_aspect_ratio"] = (
                    rect.width / rect.height if rect.height > 0 else 0.0
                )
                feats["page_area_sqin"] = (rect.width / 72.0) * (rect.height / 72.0)
                feats["is_landscape"] = float(rect.width > rect.height)

                # Text blocks
                blocks = page.get_text("dict")["blocks"]
                text_blocks = [b for b in blocks if b.get("type") == 0]
                feats["text_block_count"] = float(len(text_blocks))

                if text_blocks:
                    # Sum of bounding-box areas over the page area; the boxes
                    # are summed as-is, with no overlap correction.
                    heights = [b["bbox"][3] - b["bbox"][1] for b in text_blocks]
                    widths = [b["bbox"][2] - b["bbox"][0] for b in text_blocks]
                    total_area = sum(bh * bw for bh, bw in zip(heights, widths))
                    page_area = rect.width * rect.height
                    feats["text_density"] = (
                        total_area / page_area if page_area > 0 else 0.0
                    )

                # Font statistics
                fonts = set()
                font_sizes = []
                line_count = 0
                for block in text_blocks:
                    for line in block.get("lines", []):
                        line_count += 1
                        for span in line.get("spans", []):
                            fonts.add(span.get("font", ""))
                            sz = span.get("size", 0)
                            if sz > 0:
                                font_sizes.append(sz)

                feats["font_count"] = float(len(fonts))
                feats["line_count"] = float(line_count)

                if font_sizes:
                    mean_size = float(np.mean(font_sizes))
                    feats["avg_font_size"] = mean_size
                    feats["font_size_variance"] = (
                        float(np.var(font_sizes)) if len(font_sizes) > 1 else 0.0
                    )
                    # Largest span size relative to the mean size plus one.
                    feats["title_score"] = float(max(font_sizes) / (mean_size + 1.0))
            finally:
                # Close the document on every path, including extraction errors.
                doc.close()
        except Exception as e:
            logger.debug("PDF structural extraction failed: %s", e)
        return feats

    def to_vector(self, feats: Dict[str, float]) -> np.ndarray:
        """Return the features as a float32 vector ordered like ``FEATURE_NAMES``.

        Names missing from ``feats`` are filled with ``0.0``.
        """
        return np.array([feats.get(n, 0.0) for n in self.FEATURE_NAMES], dtype="float32")