""" Docling document parsing with figure extraction and markdown export """ from typing import Dict, List, Any import tempfile import os def parse_document(pdf_bytes: bytes) -> Dict[str, Any]: """ Parse PDF with Docling model and extract markdown + figure regions. Returns: { "html": "HTML representation of document (markdown wrapped)", "text": "full extracted text", "figures": [ {"bbox": [x, y, w, h], "page": 0}, ... ] } """ try: from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat # Save bytes to temp file (DocumentConverter needs file path) with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: tmp.write(pdf_bytes) tmp_path = tmp.name try: # Initialize converter with picture image generation enabled # Get default format option and modify it pdf_format_option = PdfFormatOption() # Enable picture image generation pdf_format_option.pipeline_options.generate_picture_images = True converter = DocumentConverter(format_options={ InputFormat.PDF: pdf_format_option }) # Convert document using correct API: convert() not convert_document() result = converter.convert(tmp_path) doc = result.document # Export as markdown markdown_text = doc.export_to_markdown() # Wrap markdown in HTML for display html = f"
{markdown_text}"
# Extract text
text = doc.export_to_text()
# Extract figures (pictures) with bounding boxes using modern Docling API
figures = []
try:
# Modern approach: iterate through doc.pictures directly
if hasattr(doc, 'pictures'):
for figure in doc.pictures:
# Skip figures not in the main body (logos, headers, etc.)
# Only include figures in the body content layer
if figure.content_layer.value != "body":
continue
# Get page number and bbox from provenance
page_num = 0
bbox_list = None
if figure.prov:
# Get page from first provenance entry
page_num = figure.prov[0].page_no
# Get bbox from first provenance entry
bbox = figure.prov[0].bbox
bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]
# Get caption from captions list
caption = ""
if figure.captions:
# Captions are RefItem objects pointing to text elements via cref
for cap_ref in figure.captions:
try:
# Parse the cref like "#/texts/11" to get the index
if hasattr(cap_ref, 'cref') and cap_ref.cref.startswith('#/texts/'):
idx = int(cap_ref.cref.split('/')[-1])
if idx < len(doc.texts):
caption = doc.texts[idx].text
break
except Exception:
pass
# Get PIL image from ImageRef
pil_image = None
if figure.image:
try:
pil_image = figure.image.pil_image
figures.append({
"bbox": bbox_list,
"page": page_num,
"caption": caption,
"image": pil_image, # Store PIL image directly
})
except Exception as e:
pass # Skip figures without valid images
except Exception as e:
# Continue without figures rather than failing completely
figures = []
return {
"html": html,
"text": text,
"figures": figures,
}
finally:
# Clean up temp file
if os.path.exists(tmp_path):
os.unlink(tmp_path)
except ImportError as e:
print(f"⚠️ Docling import error: {e}, using placeholder")
return {
"html": "Docling not available - using placeholder.
", "text": "Sample text from PDF.\n\nDocling not available - using placeholder.", "figures": [], } except Exception as e: print(f"⚠️ Docling parse error: {e}") import traceback traceback.print_exc() return { "html": f"{str(e)}",
"text": f"Error: {str(e)}",
"figures": [],
}