"""
Docling document parsing with figure extraction and markdown export.
"""
from typing import Dict, List, Any
import tempfile
import os
def _caption_for(doc, figure) -> str:
    """Return the text of the first resolvable caption of *figure*, or ""."""
    for cap_ref in figure.captions or []:
        # Captions are references of the form "#/texts/<index>" into doc.texts.
        cref = getattr(cap_ref, "cref", None)
        if not isinstance(cref, str) or not cref.startswith("#/texts/"):
            continue
        try:
            idx = int(cref.rsplit("/", 1)[-1])
            if 0 <= idx < len(doc.texts):
                return doc.texts[idx].text
        except (ValueError, TypeError, AttributeError):
            # Malformed reference or unexpected text item: try the next one.
            continue
    return ""


def _figure_record(doc, figure):
    """Build the result dict for one picture, or None if it must be skipped."""
    # Only keep pictures from the body layer (skips logos, headers, etc.).
    # A picture that exposes no content layer at all is treated as body.
    layer = getattr(figure, "content_layer", None)
    if layer is not None and getattr(layer, "value", layer) != "body":
        return None

    # Pictures without a generated image are not reported.
    image_ref = figure.image
    if not image_ref:
        return None

    page_num = 0
    bbox_list = None
    if figure.prov:
        # Page and bounding box come from the first provenance entry.
        first_prov = figure.prov[0]
        page_num = first_prov.page_no
        bbox = first_prov.bbox
        # NOTE(review): bbox.t is passed through in whatever coordinate origin
        # the provenance uses - confirm consumers expect that origin.
        bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

    return {
        "bbox": bbox_list,
        "page": page_num,
        "caption": _caption_for(doc, figure),
        "image": image_ref.pil_image,  # Store PIL image directly
    }


def _extract_figures(doc) -> List[Dict[str, Any]]:
    """Collect figure records from doc.pictures, skipping any that fail."""
    figures = []
    for index, figure in enumerate(getattr(doc, "pictures", None) or []):
        try:
            record = _figure_record(doc, figure)
        except Exception as exc:
            # One bad picture must not discard the ones already collected.
            print(f"⚠️ Skipping figure {index}: {exc}")
            continue
        if record is not None:
            figures.append(record)
    return figures


def parse_document(pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Parse PDF with Docling model and extract markdown + figure regions.

    The bytes are written to a temporary ".pdf" file (the converter is given
    a file path), converted with picture image generation enabled, and the
    temporary file is removed afterwards.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        {
            "html": "markdown export, HTML-escaped and wrapped in <pre>",
            "text": "full extracted text",
            "figures": [
                {
                    "bbox": [x, y, w, h] or None when there is no provenance,
                    "page": page number of the first provenance entry
                            (0 when there is no provenance),
                    "caption": "first resolvable caption text, or ''",
                    "image": PIL image of the figure,
                },
                ...
            ]
        }

        If Docling cannot be imported, a placeholder document with an empty
        "figures" list is returned. If parsing fails, the error message is
        returned in "html"/"text" with an empty "figures" list. This function
        does not raise for parse failures.
    """
    # Imported under an alias: "html" is also a key/name used in this function.
    from html import escape as html_escape

    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.base_models import InputFormat
    except ImportError as e:
        print(f"⚠️ Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }

    tmp_path = None
    try:
        # Save bytes to temp file (DocumentConverter needs file path).
        # The path is recorded before writing so cleanup still happens if
        # the write itself fails.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(pdf_bytes)

        # Start from the default PDF format option and enable picture images
        # so that each figure carries an image.
        pdf_format_option = PdfFormatOption()
        pdf_format_option.pipeline_options.generate_picture_images = True
        converter = DocumentConverter(
            format_options={InputFormat.PDF: pdf_format_option}
        )

        doc = converter.convert(tmp_path).document
        markdown_text = doc.export_to_markdown()

        # Escape the markdown before embedding it: document text may contain
        # "<", ">" or "&", which would otherwise be interpreted as markup.
        wrapped = (
            "<pre style='white-space: pre-wrap; word-wrap: break-word;'>"
            f"{html_escape(markdown_text)}</pre>"
        )
        return {
            "html": wrapped,
            "text": doc.export_to_text(),
            "figures": _extract_figures(doc),
        }
    except Exception as e:
        print(f"⚠️ Docling parse error: {e}")
        import traceback
        traceback.print_exc()
        return {
            "html": f"<h1>Error</h1><pre>{html_escape(str(e))}</pre>",
            "text": f"Error: {str(e)}",
            "figures": [],
        }
    finally:
        # Clean up temp file
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|