File size: 5,501 Bytes
c37e95b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Docling document parsing with figure extraction and markdown export
"""

from typing import Dict, List, Any
import tempfile
import os


def _resolve_caption(doc: Any, figure: Any) -> str:
    """Return the text of the first resolvable caption of *figure*, or ""."""
    for cap_ref in getattr(figure, "captions", None) or []:
        # Caption references are expected to look like "#/texts/11".
        cref = getattr(cap_ref, "cref", None)
        if not isinstance(cref, str) or not cref.startswith("#/texts/"):
            continue
        try:
            idx = int(cref.rsplit("/", 1)[-1])
        except ValueError:
            continue
        # Reject negative indices so a malformed reference cannot silently
        # pick a caption from the end of the list.
        if 0 <= idx < len(doc.texts):
            text = getattr(doc.texts[idx], "text", None)
            if text is not None:
                return text
    return ""


def _extract_figures(doc: Any) -> List[Dict[str, Any]]:
    """Collect body-layer pictures of *doc* that carry a loadable image."""
    figures: List[Dict[str, Any]] = []
    for index, figure in enumerate(getattr(doc, "pictures", None) or []):
        # Each figure is handled independently so that one malformed entry
        # does not discard the figures that were already collected.
        try:
            # Skip figures outside the main body (logos, headers, etc.).
            # Objects without a content layer are kept rather than failing.
            layer = getattr(figure, "content_layer", None)
            if layer is not None and getattr(layer, "value", layer) != "body":
                continue

            # Skip figures without a valid image.
            image_ref = getattr(figure, "image", None)
            pil_image = image_ref.pil_image if image_ref else None
            if pil_image is None:
                continue

            # Page number and bbox come from the first provenance entry.
            page_num = 0
            bbox_list = None
            prov = getattr(figure, "prov", None)
            if prov:
                first = prov[0]
                page_num = first.page_no
                bbox = first.bbox
                bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

            figures.append({
                "bbox": bbox_list,
                "page": page_num,
                "caption": _resolve_caption(doc, figure),
                "image": pil_image,  # Store PIL image directly
            })
        except Exception as exc:  # best effort: report and move on
            print(f"⚠️ Skipping figure {index}: {exc}")
    return figures


def parse_document(pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Parse PDF with Docling model and extract markdown + figure regions.

    Args:
        pdf_bytes: Raw bytes of the PDF; written to a temporary file that is
            removed again before returning.

    Returns:
        {
            "html": "markdown export wrapped in a <pre> element",
            "text": "result of the document's text export",
            "figures": [
                {
                    "bbox": [l, t, width, height] of the first provenance
                            entry, or None when the figure has none,
                    "page": page_no of the first provenance entry
                            (0 when the figure has no provenance),
                    "caption": "first resolvable caption text, or ''",
                    "image": the figure's `pil_image`,
                },
                ...
            ]
        }

        If Docling cannot be imported a placeholder result is returned; on
        any other failure the result describes the error. In both cases
        "figures" is an empty list. This function does not raise for
        parsing failures.
    """
    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.base_models import InputFormat

        # The converter is handed a file path, so the bytes go to a temp file.
        # The file is created before the try block so that the finally clause
        # also cleans up when writing fails.
        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(pdf_bytes)

            # Start from the default PDF format option and enable picture
            # image generation so figures carry an image.
            pdf_format_option = PdfFormatOption()
            pdf_format_option.pipeline_options.generate_picture_images = True

            converter = DocumentConverter(format_options={
                InputFormat.PDF: pdf_format_option
            })

            # Convert document using correct API: convert() not convert_document()
            result = converter.convert(tmp_path)
            doc = result.document

            # Export as markdown
            markdown_text = doc.export_to_markdown()

            # Wrap markdown in HTML for display.
            # NOTE(review): the markdown is inserted unescaped; confirm whether
            # the exporter already escapes HTML before adding escaping here.
            html = f"<pre style='white-space: pre-wrap; word-wrap: break-word;'>{markdown_text}</pre>"

            # Extract text
            text = doc.export_to_text()

            # Figures are optional: continue without them rather than
            # failing the whole parse.
            try:
                figures = _extract_figures(doc)
            except Exception as exc:
                print(f"⚠️ Docling figure extraction failed: {exc}")
                figures = []

            return {
                "html": html,
                "text": text,
                "figures": figures,
            }
        finally:
            # Clean up temp file; a cleanup failure must not replace a
            # successful result with an error.
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove temp file {tmp_path}: {cleanup_error}")

    except ImportError as e:
        print(f"⚠️ Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:
        print(f"⚠️ Docling parse error: {e}")
        import traceback
        traceback.print_exc()
        # Imported under an alias-free name that does not clash with the
        # local "html" variable used above.
        from html import escape

        message = str(e)
        return {
            # Escape the message: it may contain "<", "&" or paths that would
            # otherwise be interpreted as markup.
            "html": f"<h1>Error</h1><pre>{escape(message)}</pre>",
            "text": f"Error: {message}",
            "figures": [],
        }