# File-viewer residue kept for provenance: Spaces status "Sleeping"; file size 5,868 bytes; revision 0d61aa0.
"""
Layout Utilities Module
Contains shared logic for block extraction, ordering, and data structures
to avoid circular dependencies between app.py and other modules.
"""
import re
from dataclasses import asdict, dataclass
from typing import Any, Dict, List, Optional, Tuple

import pymupdf as fitz
@dataclass
class SpanInfo:
    """A single text span: its bounding box, text and font attributes."""

    bbox: Tuple[float, float, float, float]  # (x0, y0, x1, y1)
    text: str
    font: str  # font name as reported for the span
    size: float  # font size as reported for the span
@dataclass
class BlockInfo:
    """One page block: bounding box, concatenated text, type code and spans."""

    bbox: Tuple[float, float, float, float]  # (x0, y0, x1, y1)
    text: str
    # 0 text, 1 image, 2 drawing in PyMuPDF terms for some outputs;
    # extract_blocks_spans stores -1 when the raw block carries no "type".
    block_type: int
    spans: List[SpanInfo]  # populated for text blocks only, otherwise empty
@dataclass
class PageDiagnostic:
    """Extended diagnostic for batch processing."""

    page_num: int
    tagged_pdf: bool
    text_len: int
    image_block_count: int
    font_count: int
    has_type3_fonts: bool
    suspicious_garbled_text: bool
    likely_scanned_image_page: bool
    likely_text_as_vector_outlines: bool
    multi_column_guess: bool
    # Optional timing; stays None unless the caller supplies a value.
    processing_time_ms: Optional[int] = None
@dataclass
class BatchAnalysisResult:
    """Aggregate results from all pages."""

    total_pages: int
    pages_analyzed: int
    summary_stats: Dict[str, int]
    per_page_results: List[PageDiagnostic]
    common_issues: List[str]
    critical_pages: List[int]
    processing_time_sec: float

    def to_dict(self) -> Dict[str, Any]:
        """Convert to JSON-serializable dict.

        Returns:
            A dict with one key per field of this class, in declaration
            order. Each entry of ``per_page_results`` is expanded into a
            dict of that page record's dataclass fields.
        """
        return {
            "total_pages": self.total_pages,
            "pages_analyzed": self.pages_analyzed,
            "summary_stats": self.summary_stats,
            # asdict() emits every dataclass field in declaration order, so
            # the per-page dicts cannot drift from the record definition.
            "per_page_results": [asdict(p) for p in self.per_page_results],
            "common_issues": self.common_issues,
            "critical_pages": self.critical_pages,
            "processing_time_sec": self.processing_time_sec,
        }
def _safe_str(x: Any, max_len: int = 400) -> str:
s = str(x)
if len(s) > max_len:
s = s[:max_len] + "β¦"
return s
def _looks_like_math(text: str) -> bool:
# Heuristic: mathy glyphs/symbols and patterns
if not text:
return False
math_syms = r"[ββ«βββ β€β₯βΒ±ΓΓ·ββββ©βͺββββββ¦ββββ€ββ]"
latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)"
return bool(re.search(math_syms, text) or re.search(latexy, text))
def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]:
    """Extract the blocks of one page, with per-span details for text blocks.

    Args:
        doc: An open document; ``doc[page_index]`` must yield the page.
        page_index: Index of the page to read.

    Returns:
        One ``BlockInfo`` per entry of the page's ``"dict"`` text output, in
        the order reported. Bounding boxes are multiplied by the page's
        ``rotation_matrix``. Only text blocks (type 0) carry spans and text;
        other blocks get an empty ``text`` and an empty ``spans`` list.
    """
    page = doc[page_index]
    raw = page.get_text("dict")  # includes blocks/lines/spans with bboxes
    mat = page.rotation_matrix

    blocks: List[BlockInfo] = []
    for b in raw.get("blocks", []):
        btype = int(b.get("type", -1))

        # Transform block bbox to visual coordinates
        bbox = tuple(fitz.Rect(b.get("bbox", (0, 0, 0, 0))) * mat)

        line_texts: List[str] = []
        spans: List[SpanInfo] = []
        if btype == 0:  # text
            for line in b.get("lines", []):
                line_parts: List[str] = []
                for sp in line.get("spans", []):
                    t = sp.get("text", "")
                    if not t:
                        continue  # empty spans add neither text nor geometry
                    line_parts.append(t)
                    # Transform span bbox to visual coordinates
                    sp_bbox_rect = fitz.Rect(sp.get("bbox", (0, 0, 0, 0))) * mat
                    spans.append(
                        SpanInfo(
                            bbox=tuple(sp_bbox_rect),
                            text=t,
                            font=_safe_str(sp.get("font", "")),
                            size=float(sp.get("size", 0.0)),
                        )
                    )
                if line_parts:
                    line_texts.append("".join(line_parts))

        # Spans within a line are contiguous, but separate lines need a
        # separator; otherwise the last word of one line fuses with the first
        # word of the next.
        text = "\n".join(line_texts).strip()
        blocks.append(BlockInfo(bbox=bbox, text=text, block_type=btype, spans=spans))
    return blocks
def _tblr_key(item: Tuple[int, BlockInfo]) -> Tuple[int, int]:
    """Sort key for top-to-bottom, then left-to-right, on truncated coords."""
    x0, y0, _x1, _y1 = item[1].bbox
    return (int(y0), int(x0))


def order_blocks(blocks: List[BlockInfo], mode: str) -> List[Tuple[int, BlockInfo]]:
    """Return list of (idx, block) in chosen order.

    Args:
        blocks: Blocks to order; only each block's ``bbox`` is read.
        mode: ``"raw"`` keeps the input order; ``"tblr"`` sorts top-to-bottom
            then left-to-right; ``"columns"`` applies a two-column heuristic.
            Any other value falls back to ``"tblr"``.

    Returns:
        ``(original_index, block)`` pairs in reading order for the mode.
    """
    indexed = list(enumerate(blocks))
    if mode == "raw":
        return indexed
    if mode != "columns":
        # "tblr" and the fallback for unknown modes share the same ordering.
        return sorted(indexed, key=_tblr_key)

    # Simple 2-column heuristic:
    # cluster by x-center around midline, then sort within each column.
    # This is a heuristic; tagged PDFs should make this unnecessary.
    wide = [b.bbox for _, b in indexed if (b.bbox[2] - b.bbox[0]) > 5]
    if not wide:
        return sorted(indexed, key=_tblr_key)

    # Split at the midline of the horizontal extent of the usable blocks.
    # A median of the x-centers would land inside a column whenever the two
    # columns hold different numbers of blocks, cutting that column in two.
    mid = (min(bb[0] for bb in wide) + max(bb[2] for bb in wide)) / 2.0

    left: List[Tuple[int, BlockInfo]] = []
    right: List[Tuple[int, BlockInfo]] = []
    for item in indexed:
        x0, _y0, x1, _y1 = item[1].bbox
        center_x = (x0 + x1) / 2.0
        (left if center_x < mid else right).append(item)

    # Read left column first, then right
    return sorted(left, key=_tblr_key) + sorted(right, key=_tblr_key)
# (trailing table delimiter from the file viewer removed)