Spaces:

rianders
/

pdfinspector

Sleeping

App Files Files Community

pdfinspector / layout_utils.py

rianders

Fix file load errors and implement auto-refresh functionality

0d61aa0 about 2 months ago

raw

history blame contribute delete

5.87 kB

	"""
	Layout Utilities Module

	Contains shared logic for block extraction, ordering, and data structures
	to avoid circular dependencies between app.py and other modules.
	"""

	from dataclasses import dataclass
	from typing import List, Tuple, Any, Dict, Optional
	import pymupdf as fitz
	import re

	@dataclass
	class SpanInfo:
	    """A single text span: where it sits on the page and how it is set."""

	    # Four-float bounding box of the span.
	    bbox: Tuple[float, float, float, float]
	    # Text carried by the span (may be an empty string).
	    text: str
	    # Font name reported for the span.
	    font: str
	    # Font size reported for the span.
	    size: float

	@dataclass
	class BlockInfo:
	    """A page block: bounding box, joined text, type code and its spans."""

	    # Four-float bounding box of the whole block.
	    bbox: Tuple[float, float, float, float]
	    # Concatenated text of the block's spans.
	    text: str
	    block_type: int  # 0 text, 1 image, 2 drawing in PyMuPDF terms for some outputs
	    # Per-span details belonging to this block.
	    spans: List[SpanInfo]

	@dataclass
	class PageDiagnostic:
	    """Per-page diagnostic record used by batch processing."""

	    page_num: int
	    tagged_pdf: bool
	    text_len: int
	    image_block_count: int
	    font_count: int
	    has_type3_fonts: bool
	    suspicious_garbled_text: bool
	    likely_scanned_image_page: bool
	    likely_text_as_vector_outlines: bool
	    multi_column_guess: bool
	    # Optional timing in milliseconds; stays None unless a value is supplied.
	    processing_time_ms: Optional[int] = None

	@dataclass
	class BatchAnalysisResult:
	    """Combined outcome of analysing the pages of one document."""

	    total_pages: int
	    pages_analyzed: int
	    summary_stats: Dict[str, int]
	    per_page_results: List[PageDiagnostic]
	    common_issues: List[str]
	    critical_pages: List[int]
	    processing_time_sec: float

	    def to_dict(self) -> Dict[str, Any]:
	        """Return this result as a plain dict meant for JSON serialisation.

	        Each entry of ``per_page_results`` is flattened into its own dict;
	        the remaining attributes are passed through unchanged (not copied).
	        """
	        # Attributes copied from every page entry, in output order.
	        page_keys = (
	            "page_num",
	            "tagged_pdf",
	            "text_len",
	            "image_block_count",
	            "font_count",
	            "has_type3_fonts",
	            "suspicious_garbled_text",
	            "likely_scanned_image_page",
	            "likely_text_as_vector_outlines",
	            "multi_column_guess",
	            "processing_time_ms",
	        )

	        payload: Dict[str, Any] = {}
	        payload["total_pages"] = self.total_pages
	        payload["pages_analyzed"] = self.pages_analyzed
	        payload["summary_stats"] = self.summary_stats
	        payload["per_page_results"] = [
	            {key: getattr(page, key) for key in page_keys}
	            for page in self.per_page_results
	        ]
	        payload["common_issues"] = self.common_issues
	        payload["critical_pages"] = self.critical_pages
	        payload["processing_time_sec"] = self.processing_time_sec
	        return payload

	def _safe_str(x: Any, max_len: int = 400) -> str:
	    """Stringify *x*, cutting results longer than *max_len* and appending "…"."""
	    rendered = str(x)
	    # Anything within the limit is returned untouched.
	    return rendered if len(rendered) <= max_len else rendered[:max_len] + "…"

	def _looks_like_math(text: str) -> bool:
	    """Heuristically report whether *text* contains mathematical notation.

	    Returns True when *text* contains any of:
	      * a math glyph from the symbol set below (e.g. ∑, ∫, ≤, ℝ);
	      * a backslash command frac, sqrt, sum or int (LaTeX style);
	      * an underscore or a caret (subscript / superscript markers);
	      * one of sin, cos, tan, log, ln as a standalone word.

	    Empty or falsy input returns False.
	    """
	    if not text:
	        return False
	    math_syms = r"[∑∫√≈≠≤≥∞±×÷∂∇∈∩∪⊂⊆⊇⊃→↦∀∃ℝℤℚℕ]"
	    # Alternatives are separated by a bare "|". Writing "\|" instead would
	    # match a literal pipe and turn the whole pattern into one long literal
	    # sequence that practically never matches.
	    latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)"
	    return bool(re.search(math_syms, text) or re.search(latexy, text))

	def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]:
	    """Build a BlockInfo for every block of page *page_index* in *doc*.

	    The page is read with ``get_text("dict")``. Every block and span bbox is
	    multiplied by the page's ``rotation_matrix`` before being stored. Spans
	    are collected only for blocks whose type is 0 (text); other block types
	    get an empty span list and empty text. A block's text is the plain
	    concatenation of its non-empty span texts, stripped at both ends.
	    """
	    page = doc[page_index]
	    page_dict = page.get_text("dict")  # includes blocks/lines/spans with bboxes
	    to_visual = page.rotation_matrix
	    missing_box = (0, 0, 0, 0)  # stand-in when an entry carries no "bbox"

	    collected: List[BlockInfo] = []
	    for raw_block in page_dict.get("blocks", []):
	        kind = int(raw_block.get("type", -1))

	        # Block bbox in visual (rotation-applied) coordinates.
	        block_box = tuple(fitz.Rect(raw_block.get("bbox", missing_box)) * to_visual)

	        pieces: List[str] = []
	        span_infos: List[SpanInfo] = []
	        if kind == 0:  # text
	            for raw_line in raw_block.get("lines", []):
	                for raw_span in raw_line.get("spans", []):
	                    span_text = raw_span.get("text", "")
	                    if span_text:
	                        pieces.append(span_text)

	                    # NOTE(review): the span is recorded even when its text
	                    # is empty; only the joined block text skips it. Confirm
	                    # this nesting against the upstream file.
	                    span_box = fitz.Rect(raw_span.get("bbox", missing_box)) * to_visual
	                    span_infos.append(
	                        SpanInfo(
	                            bbox=tuple(span_box),
	                            text=span_text,
	                            font=_safe_str(raw_span.get("font", "")),
	                            size=float(raw_span.get("size", 0.0)),
	                        )
	                    )

	        collected.append(
	            BlockInfo(
	                bbox=block_box,
	                text="".join(pieces).strip(),
	                block_type=kind,
	                spans=span_infos,
	            )
	        )
	    return collected

	def order_blocks(blocks: List[BlockInfo], mode: str) -> List[Tuple[int, BlockInfo]]:
	    """
	    Pair each block with its input index and return the pairs ordered by *mode*.

	    "raw" keeps the input order. "columns" applies a simple two-column
	    heuristic: the median x-centre of blocks wider than 5 units is the
	    divider, blocks whose centre lies left of it are read first, and each
	    side is sorted top-to-bottom then left-to-right. Any other mode,
	    including "tblr", sorts everything top-to-bottom then left-to-right.
	    """
	    numbered = list(enumerate(blocks))
	    if mode == "raw":
	        return numbered

	    def reading_key(entry: Tuple[int, BlockInfo]) -> Tuple[int, int]:
	        # Sort by truncated top edge first, then truncated left edge.
	        left_x, top_y, _right_x, _bottom_y = entry[1].bbox
	        return (int(top_y), int(left_x))

	    if mode != "columns":
	        # Covers "tblr" as well as the fallback for unrecognised modes.
	        return sorted(numbered, key=reading_key)

	    def width_and_centre(entry: Tuple[int, BlockInfo]) -> Tuple[float, float]:
	        left_x, _top_y, right_x, _bottom_y = entry[1].bbox
	        return right_x - left_x, (left_x + right_x) / 2.0

	    # This is a heuristic; tagged PDFs should make this unnecessary.
	    measured = [(entry, *width_and_centre(entry)) for entry in numbered]

	    # Only blocks wider than 5 units vote on where the divider goes.
	    wide_centres = sorted(centre for _, width, centre in measured if width > 5)
	    if not wide_centres:
	        return sorted(numbered, key=reading_key)
	    divider = wide_centres[len(wide_centres) // 2]

	    first_column = [entry for entry, _, centre in measured if centre < divider]
	    second_column = [entry for entry, _, centre in measured if not (centre < divider)]
	    first_column.sort(key=reading_key)
	    second_column.sort(key=reading_key)

	    # Read left column first, then right
	    return first_column + second_column