Spaces:
Sleeping
Sleeping
"""
Layout Utilities Module

Contains shared logic for block extraction, ordering, and data structures
to avoid circular dependencies between app.py and other modules.
"""
| from dataclasses import dataclass | |
| from typing import List, Tuple, Any, Dict, Optional | |
| import pymupdf as fitz | |
| import re | |
@dataclass
class SpanInfo:
    """One text span of a page block, as collected by extract_blocks_spans.

    Declared as a dataclass so it can be built with keyword arguments
    (``SpanInfo(bbox=..., text=..., font=..., size=...)``).
    """

    # Span rectangle as a 4-tuple of floats.
    bbox: Tuple[float, float, float, float]
    # Text content of the span.
    text: str
    # Font name of the span.
    font: str
    # Font size of the span.
    size: float
@dataclass
class BlockInfo:
    """One page block (text or otherwise) together with its spans.

    Declared as a dataclass so it can be built with keyword arguments
    (``BlockInfo(bbox=..., text=..., block_type=..., spans=...)``).
    """

    # Block rectangle as a 4-tuple of floats.
    bbox: Tuple[float, float, float, float]
    # Text of the block; empty when the block carries no text.
    text: str
    block_type: int  # 0 text, 1 image, 2 drawing in PyMuPDF terms for some outputs
    # Spans belonging to this block.
    spans: List[SpanInfo]
@dataclass
class PageDiagnostic:
    """Extended diagnostic for batch processing.

    Holds the per-page values that BatchAnalysisResult.to_dict serializes.
    Declared as a dataclass so that ``processing_time_ms`` is an optional
    constructor argument defaulting to ``None``.
    """

    page_num: int
    tagged_pdf: bool
    text_len: int
    image_block_count: int
    font_count: int
    has_type3_fonts: bool
    suspicious_garbled_text: bool
    likely_scanned_image_page: bool
    likely_text_as_vector_outlines: bool
    multi_column_guess: bool
    # Optional; stays None when no timing was recorded for the page.
    processing_time_ms: Optional[int] = None
@dataclass
class BatchAnalysisResult:
    """Aggregate results from all pages.

    Declared as a dataclass so instances can be built with keyword arguments.
    """

    total_pages: int
    pages_analyzed: int
    summary_stats: Dict[str, int]
    per_page_results: List[PageDiagnostic]
    common_issues: List[str]
    critical_pages: List[int]
    processing_time_sec: float

    def to_dict(self) -> Dict[str, Any]:
        """Convert to JSON-serializable dict.

        Returns:
            A dict with the aggregate fields copied by reference and
            ``per_page_results`` expanded into one plain dict per page.
        """
        # Each page is spelled out key by key so the serialized schema stays
        # explicit and independent of any extra attributes on the page object.
        pages = [
            {
                "page_num": p.page_num,
                "tagged_pdf": p.tagged_pdf,
                "text_len": p.text_len,
                "image_block_count": p.image_block_count,
                "font_count": p.font_count,
                "has_type3_fonts": p.has_type3_fonts,
                "suspicious_garbled_text": p.suspicious_garbled_text,
                "likely_scanned_image_page": p.likely_scanned_image_page,
                "likely_text_as_vector_outlines": p.likely_text_as_vector_outlines,
                "multi_column_guess": p.multi_column_guess,
                "processing_time_ms": p.processing_time_ms,
            }
            for p in self.per_page_results
        ]
        return {
            "total_pages": self.total_pages,
            "pages_analyzed": self.pages_analyzed,
            "summary_stats": self.summary_stats,
            "per_page_results": pages,
            "common_issues": self.common_issues,
            "critical_pages": self.critical_pages,
            "processing_time_sec": self.processing_time_sec,
        }
| def _safe_str(x: Any, max_len: int = 400) -> str: | |
| s = str(x) | |
| if len(s) > max_len: | |
| s = s[:max_len] + "β¦" | |
| return s | |
| def _looks_like_math(text: str) -> bool: | |
| # Heuristic: mathy glyphs/symbols and patterns | |
| if not text: | |
| return False | |
| math_syms = r"[ββ«βββ β€β₯βΒ±ΓΓ·ββββ©βͺββββββ¦ββββ€ββ]" | |
| latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)" | |
| return bool(re.search(math_syms, text) or re.search(latexy, text)) | |
def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]:
    """Collect the blocks of one page, with their spans, as BlockInfo objects.

    Args:
        doc: An open PyMuPDF document.
        page_index: Index of the page to read (``doc[page_index]``).

    Returns:
        One BlockInfo per entry of the page's ``get_text("dict")`` blocks, in
        the order returned. Only blocks of type 0 (text) get text and spans;
        every other block has an empty text and an empty span list.
    """
    page = doc[page_index]
    raw = page.get_text("dict")  # includes blocks/lines/spans with bboxes
    mat = page.rotation_matrix
    blocks: List[BlockInfo] = []
    for b in raw.get("blocks", []):
        btype = int(b.get("type", -1))
        # Transform block bbox to visual coordinates
        bbox_rect = fitz.Rect(b.get("bbox", (0, 0, 0, 0))) * mat
        bbox = tuple(bbox_rect)
        text_parts: List[str] = []
        spans: List[SpanInfo] = []
        if btype == 0:  # text
            for line in b.get("lines", []):
                for sp in line.get("spans", []):
                    t = sp.get("text", "")
                    if t:
                        text_parts.append(t)
                    # NOTE(review): spans are recorded even when their text
                    # is empty; the guard above is assumed to cover only the
                    # text accumulation - TODO confirm.
                    # Transform span bbox to visual coordinates
                    sp_bbox_rect = fitz.Rect(sp.get("bbox", (0, 0, 0, 0))) * mat
                    spans.append(
                        SpanInfo(
                            bbox=tuple(sp_bbox_rect),
                            text=t,
                            font=_safe_str(sp.get("font", "")),
                            size=float(sp.get("size", 0.0)),
                        )
                    )
        # Span texts are concatenated with no separator, so line breaks inside
        # the block are not preserved in the block text.
        text = "".join(text_parts).strip()
        blocks.append(BlockInfo(bbox=bbox, text=text, block_type=btype, spans=spans))
    return blocks
def order_blocks(blocks: List[BlockInfo], mode: str) -> List[Tuple[int, BlockInfo]]:
    """
    Return list of (idx, block) in chosen order.

    Args:
        blocks: Blocks to order; each must expose a 4-element ``bbox``.
        mode: ``"raw"`` keeps the input order; ``"tblr"`` sorts top-to-bottom
            then left-to-right; ``"columns"`` applies a two-column heuristic.
            Any other value falls back to the ``"tblr"`` ordering.

    Returns:
        A list of ``(original_index, block)`` pairs in reading order.
    """
    indexed = list(enumerate(blocks))
    if mode == "raw":
        return indexed

    def key_tblr(item: Tuple[int, BlockInfo]) -> Tuple[int, int]:
        # Sort by the integer part of the top edge, then of the left edge.
        x0, y0, _, _ = item[1].bbox
        return (int(y0), int(x0))

    # "tblr" and every unrecognized mode share the same ordering.
    if mode != "columns":
        return sorted(indexed, key=key_tblr)

    # Simple 2-column heuristic:
    # cluster by x-center around midline, then sort within each column.
    # This is a heuristic; tagged PDFs should make this unnecessary.
    centers: List[float] = []
    for _, blk in indexed:
        x0, _, x1, _ = blk.bbox
        # Blocks at most 5 units wide do not vote for the midline.
        if (x1 - x0) > 5:
            centers.append((x0 + x1) / 2.0)
    if not centers:
        return sorted(indexed, key=key_tblr)

    # Upper median of the x-centers serves as the column boundary.
    mid = sorted(centers)[len(centers) // 2]

    left: List[Tuple[int, BlockInfo]] = []
    right: List[Tuple[int, BlockInfo]] = []
    for item in indexed:
        x0, _, x1, _ = item[1].bbox
        column = left if (x0 + x1) / 2.0 < mid else right
        column.append(item)

    # Read left column first, then right
    return sorted(left, key=key_tblr) + sorted(right, key=key_tblr)