File size: 5,868 Bytes
0d61aa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Layout Utilities Module

Contains shared logic for block extraction, ordering, and data structures
to avoid circular dependencies between app.py and other modules.
"""

from dataclasses import dataclass
from typing import List, Tuple, Any, Dict, Optional
import pymupdf as fitz
import re

@dataclass
class SpanInfo:
    """One text span of a text block, with its font name and size.

    Instances are built by ``extract_blocks_spans`` from the span entries of
    ``page.get_text("dict")``.
    """
    # (x0, y0, x1, y1) of the span after multiplication by the page's
    # rotation matrix.
    bbox: Tuple[float, float, float, float]
    # Span text as reported by the extractor; "" when the span has no text.
    text: str
    # Font name, stringified and length-limited through ``_safe_str``.
    font: str
    # Font size as a float; 0.0 when the source span has no "size" entry.
    size: float

@dataclass
class BlockInfo:
    """One page block as produced by ``extract_blocks_spans``."""
    # (x0, y0, x1, y1) of the block after multiplication by the page's
    # rotation matrix.
    bbox: Tuple[float, float, float, float]
    # All non-empty span texts of the block joined without a separator and
    # stripped; "" for blocks that are not text blocks.
    text: str
    # -1 is stored when the source block carries no "type" entry.
    block_type: int  # 0 text, 1 image, 2 drawing in PyMuPDF terms for some outputs
    # Per-span details; only populated when block_type == 0.
    spans: List[SpanInfo]

@dataclass
class PageDiagnostic:
    """Extended diagnostic for batch processing.

    Holds the per-page flags and counters that ``BatchAnalysisResult.to_dict``
    serialises. The code that fills these fields is not part of this module's
    visible definitions, so the notes below are limited to what the names and
    types state; the detection rules themselves should be checked at the
    producer.
    """
    # NOTE(review): whether this is 0- or 1-based is not visible here — confirm.
    page_num: int
    # NOTE(review): looks like a document-level property repeated per page — confirm.
    tagged_pdf: bool
    # Presumably the length of the page's extracted text — TODO confirm unit
    # (characters vs. bytes).
    text_len: int
    image_block_count: int
    font_count: int
    has_type3_fonts: bool
    # Heuristic flags; the thresholds behind them live with the producer.
    suspicious_garbled_text: bool
    likely_scanned_image_page: bool
    likely_text_as_vector_outlines: bool
    multi_column_guess: bool
    # Optional; defaults to None when the caller does not supply a timing.
    processing_time_ms: Optional[int] = None

@dataclass
class BatchAnalysisResult:
    """Combined outcome of analysing a batch of pages."""
    total_pages: int
    pages_analyzed: int
    summary_stats: Dict[str, int]
    per_page_results: List[PageDiagnostic]
    common_issues: List[str]
    critical_pages: List[int]
    processing_time_sec: float

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain, JSON-serializable dict view of this result.

        Top-level values are passed through unchanged; every entry of
        ``per_page_results`` is flattened into a dict of its diagnostic
        attributes, in a fixed key order.
        """
        # Attributes copied from each per-page record, in output order.
        page_keys = (
            "page_num",
            "tagged_pdf",
            "text_len",
            "image_block_count",
            "font_count",
            "has_type3_fonts",
            "suspicious_garbled_text",
            "likely_scanned_image_page",
            "likely_text_as_vector_outlines",
            "multi_column_guess",
            "processing_time_ms",
        )
        pages = [
            {key: getattr(page, key) for key in page_keys}
            for page in self.per_page_results
        ]
        return {
            "total_pages": self.total_pages,
            "pages_analyzed": self.pages_analyzed,
            "summary_stats": self.summary_stats,
            "per_page_results": pages,
            "common_issues": self.common_issues,
            "critical_pages": self.critical_pages,
            "processing_time_sec": self.processing_time_sec,
        }

def _safe_str(x: Any, max_len: int = 400) -> str:
    s = str(x)
    if len(s) > max_len:
        s = s[:max_len] + "…"
    return s

def _looks_like_math(text: str) -> bool:
    # Heuristic: mathy glyphs/symbols and patterns
    if not text:
        return False
    math_syms = r"[βˆ‘βˆ«βˆšβ‰ˆβ‰ β‰€β‰₯βˆžΒ±Γ—Γ·βˆ‚βˆ‡βˆˆβˆ©βˆͺβŠ‚βŠ†βŠ‡βŠƒβ†’β†¦βˆ€βˆƒβ„β„€β„šβ„•]"
    latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)"
    return bool(re.search(math_syms, text) or re.search(latexy, text))

def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]:
    """
    Collect every block of one page, with per-span details for text blocks.

    The page is read through ``page.get_text("dict")`` and each bounding box
    is multiplied by ``page.rotation_matrix`` before being stored.

    Args:
        doc: Open document to read from.
        page_index: Index used to look the page up via ``doc[page_index]``.

    Returns:
        One ``BlockInfo`` per entry of the dict's ``"blocks"`` list, in the
        same order. Only blocks whose type is 0 carry spans and text; the
        text is every non-empty span text joined with no separator (also
        across line boundaries) and then stripped.
    """
    page = doc[page_index]
    layout = page.get_text("dict")  # blocks -> lines -> spans, each with a bbox
    rotation = page.rotation_matrix
    no_bbox = (0, 0, 0, 0)

    def rotated(entry) -> tuple:
        # bbox of a block/span dict, mapped through the rotation matrix
        return tuple(fitz.Rect(entry.get("bbox", no_bbox)) * rotation)

    collected: List[BlockInfo] = []
    for raw_block in layout.get("blocks", []):
        kind = int(raw_block.get("type", -1))
        block_bbox = rotated(raw_block)

        span_infos: List[SpanInfo] = []
        if kind == 0:  # text block
            raw_spans = (
                raw_span
                for raw_line in raw_block.get("lines", [])
                for raw_span in raw_line.get("spans", [])
            )
            for raw_span in raw_spans:
                span_infos.append(
                    SpanInfo(
                        bbox=rotated(raw_span),
                        text=raw_span.get("text", ""),
                        font=_safe_str(raw_span.get("font", "")),
                        size=float(raw_span.get("size", 0.0)),
                    )
                )

        # Empty span texts contribute nothing; skip them explicitly.
        joined = "".join(info.text for info in span_infos if info.text).strip()
        collected.append(
            BlockInfo(bbox=block_bbox, text=joined, block_type=kind, spans=span_infos)
        )
    return collected

def order_blocks(blocks: List[BlockInfo], mode: str) -> List[Tuple[int, BlockInfo]]:
    """
    Pair each block with its original index and arrange the pairs per *mode*.

    Modes:
        "raw":     keep the incoming order.
        "tblr":    sort by (top, left) of the bbox, both truncated to int.
        "columns": two-column heuristic. The median x-center of all blocks
                   wider than 5 units is the split line; blocks whose center
                   lies left of it come first, then the rest, each group in
                   "tblr" order. Without any block wider than 5 units this
                   degrades to "tblr".
        other:     treated like "tblr".
    """
    pairs = list(enumerate(blocks))

    def reading_key(pair: Tuple[int, BlockInfo]) -> Tuple[int, int]:
        left_x, top_y, _right_x, _bottom_y = pair[1].bbox
        return (int(top_y), int(left_x))

    if mode == "raw":
        return pairs

    if mode != "columns":
        # "tblr" and every unrecognised mode share the same ordering.
        return sorted(pairs, key=reading_key)

    # Heuristic only; tagged PDFs should make this unnecessary.
    boxes = [blk.bbox for _, blk in pairs]
    centers = [
        (x0 + x1) / 2.0
        for x0, _y0, x1, _y1 in boxes
        if (x1 - x0) > 5
    ]
    if not centers:
        return sorted(pairs, key=reading_key)
    split_x = sorted(centers)[len(centers) // 2]

    left_col = []
    right_col = []
    for pair in pairs:
        x0, _y0, x1, _y1 = pair[1].bbox
        if (x0 + x1) / 2.0 < split_x:
            left_col.append(pair)
        else:
            right_col.append(pair)

    left_col.sort(key=reading_key)
    right_col.sort(key=reading_key)

    # Left column is read in full before the right one.
    return left_col + right_col