# Spaces: internationalscholarsprogram / handbook-ocr-engine (Sleeping)
# File size: 8,388 Bytes
# b12284c

"""Text extraction from digital (native-text) PDFs.



Uses PyMuPDF (fitz) for fast native text extraction and pdfplumber
for table detection on text-based pages.
"""

from __future__ import annotations

import logging
from pathlib import Path

import fitz  # PyMuPDF
import pdfplumber

from app.schemas.extraction import (
    BlockType,
    ContentBlock,
    DocumentMetadata,
    HeadingLevel,
    ListItem,
    PageResult,
    TableBlock,
    TableCell,
)

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ── Heuristics ──

# A span whose reported font size is at or above this value is treated as a
# heading by _is_heading(), regardless of its weight.
_HEADING_MIN_SIZE = 13.0  # font size threshold for headings
# Characters that mark a line as a bulleted list item when they are the first
# non-whitespace character of the line (used by _is_list_line/_strip_bullet).
_LIST_BULLETS = {"•", "–", "-", "—", "○", "■", "□", "►", "▸", "●"}


def _is_heading(span: dict) -> bool:
    """Heuristically decide whether a text span looks like a heading.

    A span qualifies when its font size reaches ``_HEADING_MIN_SIZE``, or
    when it is bold and its size is at least 11.5. A span without a
    ``"size"`` key is treated as size 12; one without ``"flags"`` as having
    no style flags set.
    """
    font_size = span.get("size", 12)
    # Bit 4 (value 16) of the span flags marks bold text.
    bold = (span.get("flags", 0) & 16) != 0
    if font_size >= _HEADING_MIN_SIZE:
        return True
    return bold and font_size >= 11.5


def _heading_level(size: float) -> HeadingLevel:
    """Map a font size onto a heading level; larger text ranks higher.

    Thresholds: ``>= 22`` → H1, ``>= 18`` → H2, ``>= 15`` → H3,
    ``>= 13`` → H4, anything smaller → H5.
    """
    thresholds = (
        (22, HeadingLevel.H1),
        (18, HeadingLevel.H2),
        (15, HeadingLevel.H3),
        (13, HeadingLevel.H4),
    )
    # Thresholds are in descending order, so the first match is the highest
    # level the size qualifies for.
    for minimum, level in thresholds:
        if size >= minimum:
            return level
    return HeadingLevel.H5


# Longest digit run accepted as a list number. Keeps a wrapped year such as
# "2024." at the start of a line from being mistaken for an item marker.
_MAX_ENUM_DIGITS = 3


def _enumerator_len(stripped: str) -> int:
    """Return the length of a leading enumerator marker, or 0 if there is none.

    A marker is a label followed by ``.`` or ``)``, where the label is either
    a run of 1 to ``_MAX_ENUM_DIGITS`` digits (``1.``, ``12)``) or a single
    alphanumeric character (``a)``, ``B.``).

    A ``.`` that is immediately followed by a digit is not a marker: that is a
    decimal or a section number (``3.5 GPA``, ``1.2 Scope``), and stripping it
    would mangle the text.

    Args:
        stripped: The line with surrounding whitespace already removed.

    Returns:
        Number of characters occupied by the marker (label + delimiter),
        or 0 when the line does not start with one.
    """
    digits = 0
    while digits < len(stripped) and stripped[digits].isdigit():
        digits += 1

    if digits:
        if digits > _MAX_ENUM_DIGITS:
            return 0
        label_len = digits
    elif stripped[:1].isalnum():
        label_len = 1
    else:
        return 0

    # Slices (not indexing) so short strings yield "" instead of raising;
    # tuple membership because `"" in ".)"` would be True.
    delimiter = stripped[label_len:label_len + 1]
    if delimiter not in (".", ")"):
        return 0

    follower = stripped[label_len + 1:label_len + 2]
    if delimiter == "." and follower.isdigit():
        return 0
    return label_len + 1


def _is_list_line(line: str) -> bool:
    """Return True if *line* starts with a bullet or an enumerator marker.

    Bullets are the characters in ``_LIST_BULLETS``; enumerators are described
    in :func:`_enumerator_len`. Blank lines are never list lines.
    """
    stripped = line.strip()
    if not stripped:
        return False
    return stripped[0] in _LIST_BULLETS or _enumerator_len(stripped) > 0


def _strip_bullet(line: str) -> str:
    """Return *line* without its leading bullet/enumerator and padding.

    Lines that carry no marker are returned with surrounding whitespace
    removed and are otherwise unchanged.
    """
    stripped = line.strip()
    if stripped and stripped[0] in _LIST_BULLETS:
        return stripped[1:].strip()
    # A marker length of 0 leaves the text intact.
    return stripped[_enumerator_len(stripped):].strip()


# ── Page text check ──


# Heuristic cut-off shared by both native-text checks: a page must yield
# strictly more than this many characters (after trimming surrounding
# whitespace) to count as carrying real native text.
_MIN_NATIVE_TEXT_CHARS = 30


def page_has_native_text(
    pdf_path: str | Path,
    page_num: int,
    *,
    min_chars: int = _MIN_NATIVE_TEXT_CHARS,
) -> bool:
    """Return True if the page has enough native text to skip OCR.

    Args:
        pdf_path: Location of the PDF file.
        page_num: Zero-based page index.
        min_chars: The page's stripped text must be strictly longer than this.

    Returns:
        False when ``page_num`` is at or beyond the page count, or when the
        page's text is too short; True otherwise.
    """
    with fitz.open(str(pdf_path)) as doc:
        if page_num >= len(doc):
            return False
        return len(doc[page_num].get_text("text").strip()) > min_chars


def document_has_native_text(
    pdf_path: str | Path,
    *,
    min_chars: int = _MIN_NATIVE_TEXT_CHARS,
) -> bool:
    """Quick check: does ANY page have substantial native text?

    Pages are scanned in order and the scan stops at the first page whose
    stripped text is strictly longer than ``min_chars``.

    Args:
        pdf_path: Location of the PDF file.
        min_chars: Per-page length a page's stripped text must exceed.

    Returns:
        True if at least one page qualifies, False otherwise (including for a
        document with no pages).
    """
    with fitz.open(str(pdf_path)) as doc:
        return any(
            len(page.get_text("text").strip()) > min_chars for page in doc
        )


# ── Metadata ──


def extract_metadata(pdf_path: str | Path) -> DocumentMetadata:
    """Build a ``DocumentMetadata`` record for the PDF at *pdf_path*.

    Descriptive fields come from the PDF's own metadata dictionary; a missing
    or empty entry becomes ``""``. Page count is read from the opened
    document, file name and size from the filesystem path.
    """
    path = Path(pdf_path)
    with fitz.open(str(path)) as pdf:
        info = pdf.metadata or {}

        def text_field(key: str) -> str:
            # Absent keys and None/empty values all collapse to "".
            return info.get(key) or ""

        return DocumentMetadata(
            title=text_field("title"),
            author=text_field("author"),
            subject=text_field("subject"),
            creator=text_field("creator"),
            producer=text_field("producer"),
            page_count=len(pdf),
            file_name=path.name,
            file_size_bytes=path.stat().st_size,
            mime_type="application/pdf",
            creation_date=text_field("creationDate"),
            modification_date=text_field("modDate"),
        )


# ── Structured text extraction (no OCR) ──


def _blocks_from_text_dict(text_dict: dict) -> list[ContentBlock]:
    """Convert a PyMuPDF ``get_text("dict")`` result into content blocks.

    Lines are visited in the order they appear in the dictionary and sorted
    into three kinds:

    * heading – the line's first span satisfies ``_is_heading``; emitted
      immediately as its own HEADING block;
    * list item – the line text satisfies ``_is_list_line``; consecutive list
      lines are collected into a single LIST block;
    * paragraph text – everything else; consecutive lines are joined with a
      single space into one PARAGRAPH block.

    A whitespace-only line, or a change of kind, closes whatever is currently
    being collected.

    NOTE(review): collection is not reset at PyMuPDF block boundaries, so
    plain lines from adjacent text blocks merge into one paragraph unless a
    blank line, heading or list item separates them — confirm this is intended.
    """
    blocks: list[ContentBlock] = []
    paragraph_lines: list[str] = []
    pending_items: list[ListItem] = []

    def flush_paragraph() -> None:
        if not paragraph_lines:
            return
        text = " ".join(paragraph_lines).strip()
        paragraph_lines.clear()
        if text:
            blocks.append(ContentBlock(
                block_type=BlockType.PARAGRAPH,
                text=text,
                source="text",
            ))

    def flush_list() -> None:
        if not pending_items:
            return
        blocks.append(ContentBlock(
            block_type=BlockType.LIST,
            # Copy: the buffer is cleared and reused for the next list.
            list_items=list(pending_items),
            source="text",
        ))
        pending_items.clear()

    for block_dict in text_dict.get("blocks", []):
        if block_dict.get("type") != 0:  # 0 = text block
            continue
        for line_dict in block_dict.get("lines", []):
            spans = line_dict.get("spans", [])
            if not spans:
                continue

            line_text = "".join(s.get("text", "") for s in spans).strip()
            if not line_text:
                # Blank line: acts as a separator.
                flush_paragraph()
                flush_list()
                continue

            # Only the first span's font decides whether the line is a heading.
            first_span = spans[0]
            if _is_heading(first_span):
                flush_paragraph()
                flush_list()
                blocks.append(ContentBlock(
                    block_type=BlockType.HEADING,
                    text=line_text,
                    heading_level=_heading_level(first_span.get("size", 12)),
                    source="text",
                ))
            elif _is_list_line(line_text):
                flush_paragraph()
                pending_items.append(ListItem(text=_strip_bullet(line_text)))
            else:
                flush_list()
                paragraph_lines.append(line_text)

    flush_paragraph()
    flush_list()
    return blocks


def extract_text_page(pdf_path: str | Path, page_num: int) -> PageResult:
    """Extract structured blocks from a native-text PDF page.

    The page is read once with PyMuPDF to obtain its size and text layout,
    the layout is turned into heading/paragraph/list blocks, and tables found
    by pdfplumber are appended afterwards.

    Args:
        pdf_path: Location of the PDF file.
        page_num: Zero-based page index. It is not range-checked here; an
            invalid index propagates whatever error PyMuPDF raises.

    Returns:
        A ``PageResult`` with a 1-based ``page_number``, the page width and
        height, and the extracted blocks. ``is_scanned`` is always False and
        ``ocr_confidence`` always 1.0 because no OCR is involved.
    """
    with fitz.open(str(pdf_path)) as doc:
        page = doc[page_num]
        # Read the geometry now so the file does not have to be reopened.
        page_rect = page.rect
        width, height = page_rect.width, page_rect.height
        text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

    blocks = _blocks_from_text_dict(text_dict)

    # Table detection via pdfplumber. Tables are appended after all text
    # blocks, i.e. not at their reading-order position on the page.
    _extract_tables_plumber(pdf_path, page_num, blocks)

    # NOTE(review): only headings and paragraphs contribute to plain_text;
    # list items and table cells are left out — confirm that is intended.
    plain = "\n".join(
        b.text for b in blocks
        if b.block_type in (BlockType.HEADING, BlockType.PARAGRAPH)
    )

    return PageResult(
        page_number=page_num + 1,  # 1-indexed for humans
        width=width,
        height=height,
        blocks=blocks,
        plain_text=plain,
        is_scanned=False,
        ocr_confidence=1.0,
    )


def _extract_tables_plumber(
    pdf_path: str | Path,
    page_num: int,
    blocks: list[ContentBlock],
) -> None:
    """Detect tables with pdfplumber and append TableBlock entries.

    Every non-empty table found on the page is appended to *blocks* (mutated
    in place) as a TABLE content block. Cell text is stripped, a missing cell
    value becomes ``""``, and the cells of the first row are flagged as
    header cells.

    This is best effort: any exception raised while opening or parsing is
    logged as a warning with its traceback and not propagated. Tables
    appended before the failure stay in *blocks*.

    Args:
        pdf_path: Location of the PDF file.
        page_num: Zero-based page index; nothing happens when it is at or
            beyond the page count.
        blocks: Output list that receives the table blocks.
    """
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            if page_num >= len(pdf.pages):
                return
            raw_tables = pdf.pages[page_num].extract_tables()
            for raw_table in raw_tables:
                if not raw_table:
                    continue
                # Normalise once so the column count and the cell loop treat
                # a missing (None) row the same way: as an empty row.
                rows = [row or [] for row in raw_table]
                cells: list[TableCell] = [
                    TableCell(
                        text=(val or "").strip(),
                        row=ri,
                        col=ci,
                        is_header=(ri == 0),
                    )
                    for ri, row in enumerate(rows)
                    for ci, val in enumerate(row)
                ]
                tb = TableBlock(
                    rows=len(rows),
                    # Rows may be ragged; report the widest one.
                    cols=max((len(row) for row in rows), default=0),
                    cells=cells,
                )
                blocks.append(ContentBlock(
                    block_type=BlockType.TABLE,
                    table=tb,
                    source="text",
                ))
    except Exception:
        logger.warning("pdfplumber table extraction failed on page %d", page_num, exc_info=True)