| | import fitz
|
| | import pytesseract
|
| | from PIL import Image
|
| | import numpy as np
|
| | import cv2
|
| | from collections import defaultdict, Counter
|
| | import io
|
| | import re
|
| | from typing import Dict, List, Tuple, Optional, Union
|
| |
|
| |
|
class PDFArtworkMetadataExtractor:
    """Extract text metadata (fonts, font sizes, text colors) from artwork PDFs.

    Supports both PDFs with embedded (selectable) text, read directly via
    PyMuPDF, and rasterized PDFs, where Tesseract OCR is used instead.
    """

    def __init__(self, tesseract_path: Optional[str] = None):
        """Set up the extractor.

        Args:
            tesseract_path: Explicit path to the tesseract executable when
                it is not available on PATH.
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        # Populated by extract_metadata(); initialized to known defaults.
        self.pdf_doc = None
        self.metadata = {
            'fonts': {},
            'font_sizes': {},
            'text_colors': {},
            'has_selectable_text': False,
            'pages_processed': 0,
            'extraction_method': None,
        }
|
| |
|
| | def load_pdf(self, pdf_path: str) -> bool:
|
| | """
|
| | Load PDF document.
|
| |
|
| | Args:
|
| | pdf_path: Path to PDF file
|
| |
|
| | Returns:
|
| | bool: True if successful, False otherwise
|
| | """
|
| | try:
|
| | self.pdf_doc = fitz.open(pdf_path)
|
| | return True
|
| | except Exception as e:
|
| | print(f"Error loading PDF: {e}")
|
| | return False
|
| |
|
| | def _extract_selectable_text_metadata(self) -> Dict:
|
| | """
|
| | Extract metadata from selectable text using PyMuPDF.
|
| |
|
| | Returns:
|
| | Dict: Metadata dictionary with fonts, sizes, and colors
|
| | """
|
| | fonts = defaultdict(int)
|
| | font_sizes = defaultdict(int)
|
| | colors = defaultdict(int)
|
| |
|
| | for page_num in range(len(self.pdf_doc)):
|
| | page = self.pdf_doc[page_num]
|
| |
|
| |
|
| | text_dict = page.get_text("dict")
|
| |
|
| | for block in text_dict["blocks"]:
|
| | if "lines" in block:
|
| | for line in block["lines"]:
|
| | for span in line["spans"]:
|
| |
|
| | font_name = span.get("font", "Unknown")
|
| | font_size = span.get("size", 0)
|
| |
|
| |
|
| | color = span.get("color", 0)
|
| | if isinstance(color, int):
|
| |
|
| | r = (color >> 16) & 255
|
| | g = (color >> 8) & 255
|
| | b = color & 255
|
| | color_rgb = (r, g, b)
|
| | else:
|
| | color_rgb = (0, 0, 0)
|
| |
|
| |
|
| | text_content = span.get("text", "").strip()
|
| | if text_content:
|
| | fonts[font_name] += len(text_content)
|
| |
|
| | rounded_size = round(font_size, 1)
|
| | font_sizes[rounded_size] += len(text_content)
|
| | colors[color_rgb] += len(text_content)
|
| |
|
| | return {
|
| | 'fonts': dict(fonts),
|
| | 'font_sizes': dict(font_sizes),
|
| | 'text_colors': dict(colors)
|
| | }
|
| |
|
| | def _preprocess_image_for_ocr(self, image: np.ndarray) -> np.ndarray:
|
| | """
|
| | Preprocess image for better OCR results.
|
| |
|
| | Args:
|
| | image: Input image as numpy array
|
| |
|
| | Returns:
|
| | np.ndarray: Preprocessed image
|
| | """
|
| |
|
| | if len(image.shape) == 3:
|
| | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
|
| | else:
|
| | gray = image
|
| |
|
| |
|
| | denoised = cv2.fastNlMeansDenoising(gray)
|
| |
|
| |
|
| | thresh = cv2.adaptiveThreshold(
|
| | denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| | cv2.THRESH_BINARY, 11, 2
|
| | )
|
| |
|
| | return thresh
|
| |
|
| | def _estimate_font_size_from_ocr(self, image: np.ndarray, text_data: Dict) -> Dict[float, int]:
|
| | """
|
| | Estimate font sizes from OCR bounding boxes.
|
| |
|
| | Args:
|
| | image: Input image
|
| | text_data: OCR data from pytesseract
|
| |
|
| | Returns:
|
| | Dict: Font sizes and their frequencies
|
| | """
|
| | font_sizes = defaultdict(int)
|
| |
|
| | for i, text in enumerate(text_data['text']):
|
| | if text.strip():
|
| | height = text_data['height'][i]
|
| |
|
| | estimated_size = max(8, min(72, height * 0.75))
|
| |
|
| | rounded_size = round(estimated_size, 1)
|
| | font_sizes[rounded_size] += len(text.strip())
|
| |
|
| | return dict(font_sizes)
|
| |
|
| | def _extract_colors_from_image(self, image: np.ndarray, text_data: Dict) -> Dict[Tuple[int, int, int], int]:
|
| | """
|
| | Extract dominant colors from text regions.
|
| |
|
| | Args:
|
| | image: Input image
|
| | text_data: OCR data from pytesseract
|
| |
|
| | Returns:
|
| | Dict: Colors and their frequencies
|
| | """
|
| | colors = defaultdict(int)
|
| |
|
| | for i, text in enumerate(text_data['text']):
|
| | if text.strip():
|
| | x, y, w, h = (text_data['left'][i], text_data['top'][i],
|
| | text_data['width'][i], text_data['height'][i])
|
| |
|
| |
|
| | if 0 <= y < image.shape[0] and 0 <= x < image.shape[1]:
|
| | text_region = image[y:y+h, x:x+w]
|
| |
|
| | if text_region.size > 0:
|
| | if len(text_region.shape) == 3:
|
| |
|
| | pixels = text_region.reshape(-1, 3)
|
| |
|
| | unique_colors, counts = np.unique(pixels, axis=0, return_counts=True)
|
| |
|
| |
|
| | for color, count in zip(unique_colors, counts):
|
| | if np.mean(color) < 200:
|
| | colors[tuple(color)] += len(text.strip())
|
| | else:
|
| |
|
| | avg_intensity = np.mean(text_region)
|
| | if avg_intensity < 128:
|
| | colors[(0, 0, 0)] += len(text.strip())
|
| |
|
| | return dict(colors)
|
| |
|
| | def _extract_ocr_metadata(self) -> Dict:
|
| | """
|
| | Extract metadata using OCR for non-selectable text.
|
| |
|
| | Returns:
|
| | Dict: Metadata dictionary with estimated fonts, sizes, and colors
|
| | """
|
| | all_font_sizes = defaultdict(int)
|
| | all_colors = defaultdict(int)
|
| |
|
| | for page_num in range(len(self.pdf_doc)):
|
| | page = self.pdf_doc[page_num]
|
| |
|
| |
|
| | pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
| | img_data = pix.tobytes("ppm")
|
| | image = Image.open(io.BytesIO(img_data))
|
| | image_np = np.array(image)
|
| |
|
| |
|
| | processed_img = self._preprocess_image_for_ocr(image_np)
|
| |
|
| |
|
| | ocr_data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT)
|
| |
|
| |
|
| | page_font_sizes = self._estimate_font_size_from_ocr(processed_img, ocr_data)
|
| | for size, count in page_font_sizes.items():
|
| | all_font_sizes[size] += count
|
| |
|
| |
|
| | page_colors = self._extract_colors_from_image(image_np, ocr_data)
|
| | for color, count in page_colors.items():
|
| | all_colors[color] += count
|
| |
|
| |
|
| | estimated_fonts = {
|
| | 'Arial-like': sum(all_font_sizes.values()) * 0.4,
|
| | 'Times-like': sum(all_font_sizes.values()) * 0.3,
|
| | 'Helvetica-like': sum(all_font_sizes.values()) * 0.3
|
| | }
|
| |
|
| | return {
|
| | 'fonts': estimated_fonts,
|
| | 'font_sizes': dict(all_font_sizes),
|
| | 'text_colors': dict(all_colors)
|
| | }
|
| |
|
| | def _has_selectable_text(self) -> bool:
|
| | """
|
| | Check if PDF has selectable text.
|
| |
|
| | Returns:
|
| | bool: True if PDF has selectable text
|
| | """
|
| | for page_num in range(min(3, len(self.pdf_doc))):
|
| | page = self.pdf_doc[page_num]
|
| | text = page.get_text().strip()
|
| | if text:
|
| | return True
|
| | return False
|
| |
|
| | def extract_metadata(self, pdf_path: str) -> Dict:
|
| | """
|
| | Extract metadata from PDF artwork.
|
| |
|
| | Args:
|
| | pdf_path: Path to PDF file
|
| |
|
| | Returns:
|
| | Dict: Complete metadata dictionary
|
| | """
|
| | if not self.load_pdf(pdf_path):
|
| | return {'error': 'Failed to load PDF'}
|
| |
|
| | try:
|
| | self.metadata['pages_processed'] = len(self.pdf_doc)
|
| | has_selectable = self._has_selectable_text()
|
| | self.metadata['has_selectable_text'] = has_selectable
|
| |
|
| | if has_selectable:
|
| | self.metadata['extraction_method'] = 'selectable_text'
|
| | extracted_data = self._extract_selectable_text_metadata()
|
| | else:
|
| | self.metadata['extraction_method'] = 'ocr'
|
| | extracted_data = self._extract_ocr_metadata()
|
| |
|
| |
|
| | self.metadata.update(extracted_data)
|
| |
|
| |
|
| | self.metadata['fonts'] = dict(sorted(
|
| | self.metadata['fonts'].items(),
|
| | key=lambda x: x[1],
|
| | reverse=True
|
| | ))
|
| |
|
| | self.metadata['font_sizes'] = dict(sorted(
|
| | self.metadata['font_sizes'].items(),
|
| | key=lambda x: x[1],
|
| | reverse=True
|
| | ))
|
| |
|
| | self.metadata['text_colors'] = dict(sorted(
|
| | self.metadata['text_colors'].items(),
|
| | key=lambda x: x[1],
|
| | reverse=True
|
| | ))
|
| |
|
| | return self.metadata
|
| |
|
| | except Exception as e:
|
| | return {'error': f'Failed to extract metadata: {e}'}
|
| |
|
| | finally:
|
| | if self.pdf_doc:
|
| | self.pdf_doc.close()
|
| |
|
| | def get_dominant_font(self) -> Optional[str]:
|
| | """Get the most frequently used font."""
|
| | if self.metadata['fonts']:
|
| | return max(self.metadata['fonts'], key=self.metadata['fonts'].get)
|
| | return None
|
| |
|
| | def get_dominant_font_size(self) -> Optional[float]:
|
| | """Get the most frequently used font size."""
|
| | if self.metadata['font_sizes']:
|
| | return max(self.metadata['font_sizes'], key=self.metadata['font_sizes'].get)
|
| | return None
|
| |
|
| | def get_dominant_color(self) -> Optional[Tuple[int, int, int]]:
|
| | """Get the most frequently used text color."""
|
| | if self.metadata['text_colors']:
|
| | return max(self.metadata['text_colors'], key=self.metadata['text_colors'].get)
|
| | return None
|
| |
|
| | def print_summary(self):
|
| | """Print a summary of extracted metadata."""
|
| | print("PDF Artwork Metadata Summary")
|
| | print("=" * 40)
|
| | print(f"Pages processed: {self.metadata['pages_processed']}")
|
| | print(f"Has selectable text: {self.metadata['has_selectable_text']}")
|
| | print(f"Extraction method: {self.metadata['extraction_method']}")
|
| | print()
|
| |
|
| | print("Top 5 Fonts:")
|
| | for i, (font, count) in enumerate(list(self.metadata['fonts'].items())[:5]):
|
| | print(f" {i+1}. {font}: {count} characters")
|
| | print()
|
| |
|
| | print("Top 5 Font Sizes:")
|
| | for i, (size, count) in enumerate(list(self.metadata['font_sizes'].items())[:5]):
|
| | print(f" {i+1}. {size}pt: {count} characters")
|
| | print()
|
| |
|
| | print("Top 5 Text Colors (RGB):")
|
| | for i, (color, count) in enumerate(list(self.metadata['text_colors'].items())[:5]):
|
| | print(f" {i+1}. {color}: {count} characters") |