""" Docling document parsing with figure extraction and markdown export """ from typing import Dict, List, Any import tempfile import os def parse_document(pdf_bytes: bytes) -> Dict[str, Any]: """ Parse PDF with Docling model and extract markdown + figure regions. Returns: { "html": "HTML representation of document (markdown wrapped)", "text": "full extracted text", "figures": [ {"bbox": [x, y, w, h], "page": 0}, ... ] } """ try: from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat # Save bytes to temp file (DocumentConverter needs file path) with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: tmp.write(pdf_bytes) tmp_path = tmp.name try: # Initialize converter with picture image generation enabled # Get default format option and modify it pdf_format_option = PdfFormatOption() # Enable picture image generation pdf_format_option.pipeline_options.generate_picture_images = True converter = DocumentConverter(format_options={ InputFormat.PDF: pdf_format_option }) # Convert document using correct API: convert() not convert_document() result = converter.convert(tmp_path) doc = result.document # Export as markdown markdown_text = doc.export_to_markdown() # Wrap markdown in HTML for display html = f"
{markdown_text}
" # Extract text text = doc.export_to_text() # Extract figures (pictures) with bounding boxes using modern Docling API figures = [] try: # Modern approach: iterate through doc.pictures directly if hasattr(doc, 'pictures'): for figure in doc.pictures: # Skip figures not in the main body (logos, headers, etc.) # Only include figures in the body content layer if figure.content_layer.value != "body": continue # Get page number and bbox from provenance page_num = 0 bbox_list = None if figure.prov: # Get page from first provenance entry page_num = figure.prov[0].page_no # Get bbox from first provenance entry bbox = figure.prov[0].bbox bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height] # Get caption from captions list caption = "" if figure.captions: # Captions are RefItem objects pointing to text elements via cref for cap_ref in figure.captions: try: # Parse the cref like "#/texts/11" to get the index if hasattr(cap_ref, 'cref') and cap_ref.cref.startswith('#/texts/'): idx = int(cap_ref.cref.split('/')[-1]) if idx < len(doc.texts): caption = doc.texts[idx].text break except Exception: pass # Get PIL image from ImageRef pil_image = None if figure.image: try: pil_image = figure.image.pil_image figures.append({ "bbox": bbox_list, "page": page_num, "caption": caption, "image": pil_image, # Store PIL image directly }) except Exception as e: pass # Skip figures without valid images except Exception as e: # Continue without figures rather than failing completely figures = [] return { "html": html, "text": text, "figures": figures, } finally: # Clean up temp file if os.path.exists(tmp_path): os.unlink(tmp_path) except ImportError as e: print(f"⚠️ Docling import error: {e}, using placeholder") return { "html": "

Sample Document

Docling not available - using placeholder.

", "text": "Sample text from PDF.\n\nDocling not available - using placeholder.", "figures": [], } except Exception as e: print(f"⚠️ Docling parse error: {e}") import traceback traceback.print_exc() return { "html": f"

Error

{str(e)}
", "text": f"Error: {str(e)}", "figures": [], }