File size: 14,972 Bytes
27fda3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
"""
Structure Tree Analysis Module

Provides functionality for extracting and analyzing PDF structure trees,
including paragraph detection and block-to-tag mapping.
"""

from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Tuple
import pikepdf
import statistics
from collections import Counter


@dataclass
class StructureNode:
    """A single node of a PDF logical structure tree."""
    tag_type: str  # Structure type name, e.g. P, H1, Document
    depth: int
    mcid: Optional[int] = None
    alt_text: Optional[str] = None
    actual_text: Optional[str] = None
    page_ref: Optional[int] = None
    children: List['StructureNode'] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this node and all descendants into plain dicts (JSON-ready)."""
        result: Dict[str, Any] = {
            'tag_type': self.tag_type,
            'depth': self.depth,
            'mcid': self.mcid,
            'alt_text': self.alt_text,
            'actual_text': self.actual_text,
            'page_ref': self.page_ref,
        }
        result['children'] = [kid.to_dict() for kid in self.children]
        return result


def extract_structure_tree(pdf_path: str) -> Optional[StructureNode]:
    """
    Extract the complete structure tree from a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Root StructureNode, or None when the PDF has no structure tree
        or cannot be read.
    """
    try:
        with pikepdf.open(pdf_path) as pdf:
            catalog = pdf.Root
            if '/StructTreeRoot' not in catalog:
                # Untagged PDF: nothing to extract.
                return None

            tree_root = catalog.StructTreeRoot
            root_node = StructureNode(tag_type="StructTreeRoot", depth=0)

            # /K holds the kids of the root; walk them recursively.
            if '/K' in tree_root:
                _parse_structure_element(tree_root.K, root_node, 1, pdf)

            return root_node

    except Exception as exc:
        print(f"Error extracting structure tree: {exc}")
        return None


def _parse_structure_element(element, parent_node: StructureNode, depth: int, pdf: pikepdf.Pdf, max_depth: int = 20):
    """
    Recursively parse a structure element and its children.

    Args:
        element: pikepdf object representing the element (Array, bare int
            MCID, or Dictionary)
        parent_node: Parent StructureNode to attach children to
        depth: Current depth in the tree
        pdf: pikepdf.Pdf object for resolving references
        max_depth: Maximum recursion depth to prevent runaway recursion on
            malformed or cyclic trees
    """
    if depth > max_depth:
        return

    # /K may be an array of kids; recurse into each at the same depth.
    if isinstance(element, pikepdf.Array):
        for item in element:
            _parse_structure_element(item, parent_node, depth, pdf, max_depth)
        return

    # A bare integer kid is a Marked Content ID (leaf node).
    if isinstance(element, int):
        node = StructureNode(
            tag_type="MCID",
            depth=depth,
            mcid=element
        )
        parent_node.children.append(node)
        return

    # Dictionary: a real structure element with an /S tag and optional attrs.
    if isinstance(element, pikepdf.Dictionary):
        tag_type = str(element.get('/S', 'Unknown'))
        if tag_type.startswith('/'):
            tag_type = tag_type[1:]  # Drop the PDF name's leading slash

        mcid = None
        if '/MCID' in element:
            mcid = int(element.MCID)

        alt_text = None
        if '/Alt' in element:
            try:
                alt_text = str(element.Alt)
            except Exception:  # malformed /Alt value — skip it
                pass

        actual_text = None
        if '/ActualText' in element:
            try:
                actual_text = str(element.ActualText)
            except Exception:  # malformed /ActualText value — skip it
                pass

        page_ref = None
        if '/Pg' in element:
            try:
                # Resolve the /Pg reference to a 0-based page index.
                page_obj = element.Pg
                for i, page in enumerate(pdf.pages):
                    if page.obj == page_obj:
                        page_ref = i
                        break
            except Exception:  # unresolvable page reference — leave None
                pass

        node = StructureNode(
            tag_type=tag_type,
            depth=depth,
            mcid=mcid,
            alt_text=alt_text,
            actual_text=actual_text,
            page_ref=page_ref
        )
        parent_node.children.append(node)

        # Recursively process this element's children.
        if '/K' in element:
            _parse_structure_element(element.K, node, depth + 1, pdf, max_depth)


def format_tree_text(root: StructureNode, max_nodes: int = 500) -> str:
    """
    Format structure tree as indented text with box-drawing characters.

    Args:
        root: Root StructureNode
        max_nodes: Maximum number of nodes to display

    Returns:
        Formatted text representation
    """
    lines = []
    node_count = 0

    def _render(node, prefix="", is_last=True):
        nonlocal node_count
        if node_count >= max_nodes:
            # Emit the truncation marker exactly once, then stop descending.
            if node_count == max_nodes:
                lines.append(f"{prefix}... (truncated, tree too large)")
                node_count += 1
            return

        # Build a one-line description of this node.
        info = node.tag_type
        if node.mcid is not None:
            info += f" [MCID: {node.mcid}]"
        if node.alt_text:
            info += f" (Alt: {node.alt_text[:30]}...)" if len(node.alt_text) > 30 else f" (Alt: {node.alt_text})"
        if node.actual_text:
            info += f" (Text: {node.actual_text[:30]}...)" if len(node.actual_text) > 30 else f" (Text: {node.actual_text})"
        if node.page_ref is not None:
            info += f" [Page {node.page_ref + 1}]"

        # Root gets no connector; every other node gets a tree branch.
        if node.depth == 0:
            lines.append(info)
        else:
            connector = "└── " if is_last else "├── "
            lines.append(f"{prefix}{connector}{info}")

        node_count += 1

        if node.children:
            extension = "    " if is_last else "│   "
            new_prefix = prefix + extension if node.depth > 0 else ""
            last_index = len(node.children) - 1
            for i, child in enumerate(node.children):
                _render(child, new_prefix, i == last_index)

    _render(root)
    return "\n".join(lines)


def get_tree_statistics(root: StructureNode) -> Dict[str, Any]:
    """
    Calculate statistics about the structure tree.

    Args:
        root: Root StructureNode

    Returns:
        Dictionary of statistics
    """
    total = 0
    deepest = 0
    tag_counts = Counter()
    structured_pages = set()
    alt_count = 0
    actual_count = 0
    mcid_total = 0

    # Iterative preorder walk; children are pushed reversed so pop()
    # visits them in their original order.
    pending = [root]
    while pending:
        node = pending.pop()

        total += 1
        deepest = max(deepest, node.depth)
        tag_counts[node.tag_type] += 1

        if node.page_ref is not None:
            structured_pages.add(node.page_ref)
        if node.alt_text:
            alt_count += 1
        if node.actual_text:
            actual_count += 1
        if node.mcid is not None:
            mcid_total += 1

        pending.extend(reversed(node.children))

    return {
        'total_nodes': total,
        'max_depth': deepest,
        'tag_type_counts': dict(tag_counts.most_common()),
        'pages_with_structure': sorted(structured_pages),
        'nodes_with_alt_text': alt_count,
        'nodes_with_actual_text': actual_count,
        'mcid_count': mcid_total
    }


def format_statistics_markdown(stats: Dict[str, Any]) -> str:
    """Render tree statistics (from get_tree_statistics) as Markdown."""
    out = ["## Structure Tree Statistics", ""]
    out.append(f"**Total Nodes**: {stats['total_nodes']}")
    out.append(f"**Maximum Depth**: {stats['max_depth']}")
    out.append(f"**Nodes with Alt Text**: {stats['nodes_with_alt_text']}")
    out.append(f"**Nodes with Actual Text**: {stats['nodes_with_actual_text']}")
    out.append(f"**MCID References**: {stats['mcid_count']}")
    out.extend(["", "### Tag Type Distribution", ""])

    out += [f"- **{tag}**: {count}" for tag, count in stats['tag_type_counts'].items()]

    pages = stats['pages_with_structure']
    out.append("")
    out.append(f"**Pages with Structure**: {len(pages)}")

    if pages:
        # Show at most the first 20 page numbers (1-based).
        shown = ", ".join(str(p + 1) for p in pages[:20])
        if len(pages) > 20:
            shown += f" ... ({len(pages) - 20} more)"
        out.append(f"({shown})")

    return "\n".join(out)


def extract_mcid_for_page(pdf_path: str, page_index: int) -> List[int]:
    """
    Extract all MCIDs (Marked Content IDs) from a page's content stream.

    Args:
        pdf_path: Path to PDF file
        page_index: 0-based page index

    Returns:
        List of MCIDs found in the page, in content-stream order
        (duplicates preserved)
    """
    import re

    try:
        with pikepdf.open(pdf_path) as pdf:
            page = pdf.pages[page_index]

            # No content stream means no marked content at all.
            if '/Contents' not in page:
                return []

            # A page may carry a single stream or an array of streams;
            # concatenate them into one byte string.
            contents = page.Contents
            if isinstance(contents, pikepdf.Array):
                stream_data = b"".join(bytes(s.read_bytes()) for s in contents)
            else:
                stream_data = bytes(contents.read_bytes())

            # latin-1 maps every byte value, so this decode cannot fail;
            # the utf-8 fallback is retained for safety only.
            try:
                content_text = stream_data.decode('latin-1')
            except UnicodeDecodeError:
                content_text = stream_data.decode('utf-8', errors='ignore')

            # Matches '/MCID <number>' inside BDC property lists.
            matches = re.findall(r'/MCID\s+(\d+)', content_text)
            return [int(m) for m in matches]

    except Exception as e:
        print(f"Error extracting MCIDs: {e}")
        return []


def map_blocks_to_tags(pdf_path: str, page_index: int, blocks) -> List[Dict[str, Any]]:
    """
    Map visual blocks to structure tree tags via MCIDs.

    Args:
        pdf_path: Path to PDF file
        page_index: 0-based page index
        blocks: List of BlockInfo objects from extract_blocks_spans

    Returns:
        List of mappings with block index, tag info, MCID, alt text
    """
    root = extract_structure_tree(pdf_path)
    if not root:
        return []

    page_mcids = extract_mcid_for_page(pdf_path, page_index)

    # Collect MCID -> node for nodes on this page (or with no page ref),
    # walking the tree in preorder (reversed push keeps child order).
    mcid_to_node = {}
    pending = [root]
    while pending:
        node = pending.pop()
        if node.mcid is not None and (node.page_ref is None or node.page_ref == page_index):
            mcid_to_node[node.mcid] = node
        pending.extend(reversed(node.children))

    # Pair the i-th MCID occurrence with the i-th visual block.
    mappings = []
    for block_idx, mcid in enumerate(page_mcids):
        if block_idx >= len(blocks):
            continue
        node = mcid_to_node.get(mcid)
        if node is None:
            continue
        mappings.append({
            'block_index': block_idx,
            'tag_type': node.tag_type,
            'mcid': mcid,
            'alt_text': node.alt_text or "",
            'actual_text': node.actual_text or ""
        })

    return mappings


def detect_visual_paragraphs(blocks, vertical_gap_threshold: float = 15.0) -> List[List[int]]:
    """
    Detect visual paragraphs based on spacing heuristics.

    Args:
        blocks: List of BlockInfo objects
        vertical_gap_threshold: Minimum vertical gap to consider a paragraph break

    Returns:
        List of paragraph groups, where each group is a list of block indices
    """
    # Keep only non-empty text blocks (block_type 0), remembering indices.
    candidates = [
        (idx, blk) for idx, blk in enumerate(blocks)
        if blk.block_type == 0 and blk.text.strip()
    ]
    if not candidates:
        return []

    # Order top-to-bottom by the top edge of each bbox.
    candidates.sort(key=lambda pair: pair[1].bbox[1])

    first_idx, first_blk = candidates[0]
    groups = []
    current = [first_idx]
    prev_bbox = first_blk.bbox

    for idx, blk in candidates[1:]:
        bbox = blk.bbox

        # Gap between this block's top and the previous block's bottom.
        gap = bbox[1] - prev_bbox[3]
        # Positive overlap means both blocks share the same column.
        overlap = min(bbox[2], prev_bbox[2]) - max(bbox[0], prev_bbox[0])

        if gap < vertical_gap_threshold and overlap > 0:
            current.append(idx)  # continue the current paragraph
        else:
            groups.append(current)  # break: start a new paragraph
            current = [idx]

        prev_bbox = bbox

    groups.append(current)
    return groups


def detect_semantic_paragraphs(pdf_path: str, page_index: int) -> List[StructureNode]:
    """
    Extract semantic paragraph tags (<P>) from structure tree.

    Args:
        pdf_path: Path to PDF file
        page_index: 0-based page index

    Returns:
        List of StructureNode objects with tag_type='P' for the page
    """
    root = extract_structure_tree(pdf_path)
    if not root:
        return []

    # Preorder walk; reversed push keeps the original child order.
    found = []
    pending = [root]
    while pending:
        node = pending.pop()
        if node.tag_type == 'P' and (node.page_ref is None or node.page_ref == page_index):
            found.append(node)
        pending.extend(reversed(node.children))

    return found


def compare_paragraphs(visual_paragraphs: List[List[int]], semantic_paragraphs: List[StructureNode]) -> Dict[str, Any]:
    """
    Compare visual and semantic paragraph detection.

    Args:
        visual_paragraphs: List of visual paragraph groups (block indices)
        semantic_paragraphs: List of semantic <P> tags

    Returns:
        Dictionary with comparison statistics
    """
    n_visual = len(visual_paragraphs)
    n_semantic = len(semantic_paragraphs)

    # Heuristic agreement score in [0, 1] based on count similarity:
    # 1.0 when both are empty, 0.0 when exactly one is empty,
    # otherwise the ratio of the smaller count to the larger.
    if n_visual == 0 and n_semantic == 0:
        score = 1.0
    elif min(n_visual, n_semantic) == 0:
        score = 0.0
    else:
        score = min(n_visual, n_semantic) / max(n_visual, n_semantic)

    return {
        'visual_count': n_visual,
        'semantic_count': n_semantic,
        'match_score': score,
        'count_mismatch': abs(n_visual - n_semantic)
    }