""" Layout Utilities Module Contains shared logic for block extraction, ordering, and data structures to avoid circular dependencies between app.py and other modules. """ from dataclasses import dataclass from typing import List, Tuple, Any, Dict, Optional import pymupdf as fitz import re @dataclass class SpanInfo: bbox: Tuple[float, float, float, float] text: str font: str size: float @dataclass class BlockInfo: bbox: Tuple[float, float, float, float] text: str block_type: int # 0 text, 1 image, 2 drawing in PyMuPDF terms for some outputs spans: List[SpanInfo] @dataclass class PageDiagnostic: """Extended diagnostic for batch processing.""" page_num: int tagged_pdf: bool text_len: int image_block_count: int font_count: int has_type3_fonts: bool suspicious_garbled_text: bool likely_scanned_image_page: bool likely_text_as_vector_outlines: bool multi_column_guess: bool processing_time_ms: Optional[int] = None @dataclass class BatchAnalysisResult: """Aggregate results from all pages.""" total_pages: int pages_analyzed: int summary_stats: Dict[str, int] per_page_results: List[PageDiagnostic] common_issues: List[str] critical_pages: List[int] processing_time_sec: float def to_dict(self) -> Dict[str, Any]: """Convert to JSON-serializable dict.""" return { "total_pages": self.total_pages, "pages_analyzed": self.pages_analyzed, "summary_stats": self.summary_stats, "per_page_results": [ { "page_num": p.page_num, "tagged_pdf": p.tagged_pdf, "text_len": p.text_len, "image_block_count": p.image_block_count, "font_count": p.font_count, "has_type3_fonts": p.has_type3_fonts, "suspicious_garbled_text": p.suspicious_garbled_text, "likely_scanned_image_page": p.likely_scanned_image_page, "likely_text_as_vector_outlines": p.likely_text_as_vector_outlines, "multi_column_guess": p.multi_column_guess, "processing_time_ms": p.processing_time_ms, } for p in self.per_page_results ], "common_issues": self.common_issues, "critical_pages": self.critical_pages, "processing_time_sec": self.processing_time_sec, } def _safe_str(x: Any, max_len: int = 400) -> str: s = str(x) if len(s) > max_len: s = s[:max_len] + "…" return s def _looks_like_math(text: str) -> bool: # Heuristic: mathy glyphs/symbols and patterns if not text: return False math_syms = r"[∑∫√≈≠≤≥∞±×÷∂∇∈∩∪⊂⊆⊇⊃→↦∀∃ℝℤℚℕ]" latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)" return bool(re.search(math_syms, text) or re.search(latexy, text)) def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]: page = doc[page_index] raw = page.get_text("dict") # includes blocks/lines/spans with bboxes mat = page.rotation_matrix blocks: List[BlockInfo] = [] for b in raw.get("blocks", []): btype = int(b.get("type", -1)) # Transform block bbox to visual coordinates bbox_rect = fitz.Rect(b.get("bbox", (0, 0, 0, 0))) * mat bbox = tuple(bbox_rect) text_parts: List[str] = [] spans: List[SpanInfo] = [] if btype == 0: # text for line in b.get("lines", []): for sp in line.get("spans", []): t = sp.get("text", "") if t: text_parts.append(t) # Transform span bbox to visual coordinates sp_bbox_rect = fitz.Rect(sp.get("bbox", (0, 0, 0, 0))) * mat spans.append( SpanInfo( bbox=tuple(sp_bbox_rect), text=t, font=_safe_str(sp.get("font", "")), size=float(sp.get("size", 0.0)), ) ) text = "".join(text_parts).strip() blocks.append(BlockInfo(bbox=bbox, text=text, block_type=btype, spans=spans)) return blocks def order_blocks(blocks: List[BlockInfo], mode: str) -> List[Tuple[int, BlockInfo]]: """ Return list of (idx, block) in chosen order. """ indexed = list(enumerate(blocks)) if mode == "raw": return indexed def key_tblr(item: Tuple[int, BlockInfo]) -> Tuple[int, int]: _, b = item x0, y0, x1, y1 = b.bbox return (int(y0), int(x0)) if mode == "tblr": return sorted(indexed, key=key_tblr) if mode == "columns": # Simple 2-column heuristic: # cluster by x-center around midline, then sort within each column. # This is a heuristic; tagged PDFs should make this unnecessary. xs = [] for _, b in indexed: x0, y0, x1, y1 = b.bbox if (x1 - x0) > 5: xs.append((x0 + x1) / 2.0) if not xs: return sorted(indexed, key=key_tblr) mid = sorted(xs)[len(xs) // 2] left = [] right = [] for it in indexed: _, b = it x0, y0, x1, y1 = b.bbox cx = (x0 + x1) / 2.0 (left if cx < mid else right).append(it) left = sorted(left, key=key_tblr) right = sorted(right, key=key_tblr) # Read left column first, then right return left + right # Fallback return sorted(indexed, key=key_tblr)