Spaces:

pengyuan
/

DocAI

Running

File size: 5,501 Bytes

c37e95b

"""
Docling document parsing with figure extraction and markdown export
"""

from typing import Dict, List, Any
import tempfile
import os


def _resolve_caption(doc: Any, figure: Any) -> str:
    """Return the text of the first resolvable caption of *figure*, or ""."""
    for cap_ref in getattr(figure, "captions", None) or []:
        cref = getattr(cap_ref, "cref", None)
        # Only references of the form "#/texts/<index>" are understood here.
        if not isinstance(cref, str) or not cref.startswith("#/texts/"):
            continue
        try:
            idx = int(cref.split("/")[-1])
            # Reject negative indices so a bad reference cannot silently
            # pick a text element counted from the end of the list.
            if 0 <= idx < len(doc.texts):
                return doc.texts[idx].text
        except (ValueError, TypeError, AttributeError, IndexError) as exc:
            print(f"⚠️ Could not resolve figure caption {cref!r}: {exc}")
    return ""


def _extract_figures(doc: Any) -> List[Dict[str, Any]]:
    """Collect body-layer pictures of *doc* that carry an image.

    Each figure is processed independently: a failure on one figure is
    reported and that figure is skipped, without discarding the others.
    """
    figures: List[Dict[str, Any]] = []
    for figure in getattr(doc, "pictures", None) or []:
        try:
            # Skip figures outside the main body (logos, headers, etc.).
            if figure.content_layer.value != "body":
                continue

            # Page and bbox come from the first provenance entry, if any.
            page_num = 0
            bbox_list = None
            if figure.prov:
                first_prov = figure.prov[0]
                page_num = first_prov.page_no
                box = first_prov.bbox
                bbox_list = [box.l, box.t, box.width, box.height]

            caption = _resolve_caption(doc, figure)

            # Figures without an image reference are not reported at all.
            image_ref = figure.image
            if not image_ref:
                continue
            pil_image = image_ref.pil_image
        except Exception as exc:
            # Best-effort: one malformed figure must not drop the rest.
            print(f"⚠️ Skipping figure that could not be extracted: {exc}")
            continue

        figures.append({
            "bbox": bbox_list,
            "page": page_num,
            "caption": caption,
            "image": pil_image,  # PIL image object, stored directly
        })
    return figures


def parse_document(pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Parse PDF with Docling model and extract markdown + figure regions.

    The bytes are written to a temporary ".pdf" file (the converter is given
    a file path), converted, and the temporary file is always removed.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        {
            "html": "<pre> block containing the HTML-escaped markdown export",
            "text": "full extracted text",
            "figures": [
                {
                    "bbox": [l, t, width, height] of the first provenance
                            entry, or None when the figure has no provenance,
                    "page": page_no of the first provenance entry
                            (0 when the figure has no provenance),
                    "caption": "caption text, or empty string",
                    "image": PIL image of the figure,
                },
                ...
            ]
        }
        NOTE(review): bbox coordinate origin/units and page numbering base
        are whatever Docling reports -- confirm against consumers.

        This function does not raise for ordinary errors. If Docling cannot
        be imported, a placeholder result is returned; on any other
        exception the "html"/"text" fields describe the error. In both
        cases "figures" is an empty list.
    """
    # Local import: the result dict uses an "html" key, so only the escape
    # function is pulled in to avoid confusion with the module name.
    from html import escape

    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.base_models import InputFormat

        # Save bytes to temp file (DocumentConverter needs file path).
        # delete=False so the file survives close(); it is removed in the
        # finally block below, even if the write itself fails.
        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(pdf_bytes)

            # Enable picture image generation so figures carry an image.
            pdf_format_option = PdfFormatOption()
            pdf_format_option.pipeline_options.generate_picture_images = True

            converter = DocumentConverter(format_options={
                InputFormat.PDF: pdf_format_option
            })

            # Convert document using correct API: convert() not convert_document()
            result = converter.convert(tmp_path)
            doc = result.document

            markdown_text = doc.export_to_markdown()

            # Wrap markdown in HTML for display. The markdown originates from
            # an arbitrary PDF, so it is escaped before being embedded.
            html_body = (
                "<pre style='white-space: pre-wrap; word-wrap: break-word;'>"
                f"{escape(markdown_text)}</pre>"
            )

            text = doc.export_to_text()

            try:
                figures = _extract_figures(doc)
            except Exception as exc:
                # Continue without figures rather than failing completely
                print(f"⚠️ Figure extraction failed: {exc}")
                figures = []

            return {
                "html": html_body,
                "text": text,
                "figures": figures,
            }
        finally:
            # Clean up temp file
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except ImportError as e:
        print(f"⚠️ Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:
        print(f"⚠️ Docling parse error: {e}")
        import traceback
        traceback.print_exc()
        return {
            # Exception text may echo document content; escape it for HTML.
            "html": f"<h1>Error</h1><pre>{escape(str(e))}</pre>",
            "text": f"Error: {str(e)}",
            "figures": [],
        }