Spaces:

rianders
/

pdfinspector

Sleeping

File size: 5,868 Bytes

0d61aa0

"""
Layout Utilities Module

Contains shared logic for block extraction, ordering, and data structures
to avoid circular dependencies between app.py and other modules.
"""

from dataclasses import dataclass
from typing import List, Tuple, Any, Dict, Optional
import pymupdf as fitz
import re

@dataclass
class SpanInfo:
    """One text span of a page: where it sits and how it is set.

    Instances are built by ``extract_blocks_spans`` from the spans of
    PyMuPDF's ``page.get_text("dict")`` output.
    """
    # (x0, y0, x1, y1); extract_blocks_spans stores the span bbox after
    # multiplying it by the page's rotation matrix.
    bbox: Tuple[float, float, float, float]
    # Span text exactly as reported by the extractor (may be empty).
    text: str
    # Font name, passed through _safe_str (so it is length-capped).
    font: str
    # Font size as a float; 0.0 when the extractor reports no size.
    size: float

@dataclass
class BlockInfo:
    """One layout block of a page, with the spans it contains.

    Instances are built by ``extract_blocks_spans`` and consumed by
    ``order_blocks``, which only looks at ``bbox``.
    """
    # (x0, y0, x1, y1); extract_blocks_spans stores the block bbox after
    # multiplying it by the page's rotation matrix.
    bbox: Tuple[float, float, float, float]
    # Stripped text assembled from the block's spans; empty for non-text blocks.
    text: str
    # extract_blocks_spans stores -1 here when the raw block has no "type" key.
    block_type: int  # 0 text, 1 image, 2 drawing in PyMuPDF terms for some outputs
    # Spans of the block in extraction order; empty unless block_type == 0.
    spans: List[SpanInfo]

@dataclass
class PageDiagnostic:
    """Extended diagnostic for batch processing.

    A flat record of counters and heuristic flags for a single page. The
    values are computed outside this module; here the record is only declared
    and later serialized field-by-field by ``BatchAnalysisResult.to_dict``.
    """
    # NOTE(review): whether this is 0- or 1-based is not established in this
    # module — confirm with the code that fills it in.
    page_num: int
    # Presumably a document-level property repeated on every page — verify against the producer.
    tagged_pdf: bool
    # Counters gathered for the page.
    text_len: int
    image_block_count: int
    font_count: int
    # Heuristic flags; the detection rules live with the producer, not here.
    has_type3_fonts: bool
    suspicious_garbled_text: bool
    likely_scanned_image_page: bool
    likely_text_as_vector_outlines: bool
    multi_column_guess: bool
    # Optional timing; stays None unless the producer sets it.
    processing_time_ms: Optional[int] = None

@dataclass
class BatchAnalysisResult:
    """Batch-level summary that bundles the diagnostics of every analysed page."""
    total_pages: int
    pages_analyzed: int
    summary_stats: Dict[str, int]
    per_page_results: List[PageDiagnostic]
    common_issues: List[str]
    critical_pages: List[int]
    processing_time_sec: float

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict, with each page expanded to a dict.

        Top-level values are passed through unchanged; every entry of
        ``per_page_results`` is flattened into a dict of its diagnostic
        fields, in declaration order.
        """
        # Attributes copied from each per-page record, in output order.
        page_fields = (
            "page_num",
            "tagged_pdf",
            "text_len",
            "image_block_count",
            "font_count",
            "has_type3_fonts",
            "suspicious_garbled_text",
            "likely_scanned_image_page",
            "likely_text_as_vector_outlines",
            "multi_column_guess",
            "processing_time_ms",
        )

        serialized_pages = []
        for page in self.per_page_results:
            serialized_pages.append(
                {name: getattr(page, name) for name in page_fields}
            )

        payload: Dict[str, Any] = {}
        payload["total_pages"] = self.total_pages
        payload["pages_analyzed"] = self.pages_analyzed
        payload["summary_stats"] = self.summary_stats
        payload["per_page_results"] = serialized_pages
        payload["common_issues"] = self.common_issues
        payload["critical_pages"] = self.critical_pages
        payload["processing_time_sec"] = self.processing_time_sec
        return payload

def _safe_str(x: Any, max_len: int = 400) -> str:
    s = str(x)
    if len(s) > max_len:
        s = s[:max_len] + "…"
    return s

def _looks_like_math(text: str) -> bool:
    # Heuristic: mathy glyphs/symbols and patterns
    if not text:
        return False
    math_syms = r"[∑∫√≈≠≤≥∞±×÷∂∇∈∩∪⊂⊆⊇⊃→↦∀∃ℝℤℚℕ]"
    latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)"
    return bool(re.search(math_syms, text) or re.search(latexy, text))

def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]:
    """
    Extract the blocks of one page, with span details for text blocks.

    Args:
        doc: PyMuPDF document to read from.
        page_index: Index of the page, used as ``doc[page_index]``.

    Returns:
        One ``BlockInfo`` per block of ``page.get_text("dict")``, in the order
        the extractor reports them. For text blocks (type 0) ``spans`` lists
        every span and ``text`` holds the block's lines joined with newlines
        and stripped; other block types get empty text and no spans.
    """
    page = doc[page_index]
    raw = page.get_text("dict")  # includes blocks/lines/spans with bboxes
    # Every bbox is multiplied by the page rotation matrix; per the original
    # author's intent this yields coordinates matching the page as displayed.
    mat = page.rotation_matrix

    blocks: List[BlockInfo] = []
    for b in raw.get("blocks", []):
        btype = int(b.get("type", -1))  # -1 marks a block without a type
        bbox = tuple(fitz.Rect(b.get("bbox", (0, 0, 0, 0))) * mat)

        line_texts: List[str] = []
        spans: List[SpanInfo] = []
        if btype == 0:  # text
            for line in b.get("lines", []):
                line_parts: List[str] = []
                for sp in line.get("spans", []):
                    t = sp.get("text", "")
                    if t:
                        line_parts.append(t)
                    spans.append(
                        SpanInfo(
                            bbox=tuple(fitz.Rect(sp.get("bbox", (0, 0, 0, 0))) * mat),
                            text=t,
                            font=_safe_str(sp.get("font", "")),
                            size=float(sp.get("size", 0.0)),
                        )
                    )
                if line_parts:
                    # Spans of one line are contiguous, so join them directly.
                    line_texts.append("".join(line_parts))

        # Span text carries no line break, so lines must be separated
        # explicitly; otherwise the last word of a line fuses with the first
        # word of the next one.
        text = "\n".join(line_texts).strip()
        blocks.append(BlockInfo(bbox=bbox, text=text, block_type=btype, spans=spans))
    return blocks

def order_blocks(blocks: List[BlockInfo], mode: str) -> List[Tuple[int, BlockInfo]]:
    """
    Return list of (idx, block) in chosen order.

    Args:
        blocks: Blocks to order. Only each block's ``bbox``
            ``(x0, y0, x1, y1)`` is consulted.
        mode: ``"raw"`` keeps the input order; ``"tblr"`` sorts top-to-bottom,
            then left-to-right; ``"columns"`` applies a simple two-column
            heuristic (left column first, then right). Any other value
            behaves like ``"tblr"``.

    Returns:
        A new list of ``(original_index, block)`` tuples; ``blocks`` itself
        is not modified.
    """
    indexed = list(enumerate(blocks))
    if mode == "raw":
        return indexed

    def key_tblr(item: Tuple[int, BlockInfo]) -> Tuple[int, int]:
        # Coordinates are truncated to whole units before comparing.
        x0, y0, _, _ = item[1].bbox
        return (int(y0), int(x0))

    if mode != "columns":
        # "tblr" and the fallback for unknown modes share the same ordering.
        return sorted(indexed, key=key_tblr)

    # Simple 2-column heuristic: split blocks at the vertical midline of the
    # content, then sort within each column.
    # This is a heuristic; tagged PDFs should make this unnecessary.
    # Blocks no wider than 5 units are ignored when locating the midline.
    extents = [
        (b.bbox[0], b.bbox[2])
        for _, b in indexed
        if (b.bbox[2] - b.bbox[0]) > 5
    ]
    if not extents:
        return sorted(indexed, key=key_tblr)

    # Use the geometric midline of the content extent. The median x-centre
    # always splits the blocks roughly in half, so with unevenly filled
    # columns it lands inside the fuller column and tears it apart.
    content_left = min(x0 for x0, _ in extents)
    content_right = max(x1 for _, x1 in extents)
    mid = (content_left + content_right) / 2.0

    left: List[Tuple[int, BlockInfo]] = []
    right: List[Tuple[int, BlockInfo]] = []
    for item in indexed:
        x0, _, x1, _ = item[1].bbox
        centre = (x0 + x1) / 2.0
        (left if centre < mid else right).append(item)

    left.sort(key=key_tblr)
    right.sort(key=key_tblr)

    # Read left column first, then right
    return left + right