Spaces:

internationalscholarsprogram
/

handbook-ocr-engine

Sleeping

File size: 7,529 Bytes

b12284c

"""OCR extraction for scanned / image-based PDF pages.



Uses pytesseract with OpenCV preprocessing. Falls back gracefully
if Tesseract is not installed.

"""

from __future__ import annotations

import logging
import re
from pathlib import Path

import fitz  # PyMuPDF — to rasterize pages
from PIL import Image

from app.schemas.extraction import (
    BlockType,
    ContentBlock,
    HeadingLevel,
    ListItem,
    PageResult,
    TableBlock,
    TableCell,
)
from app.services.preprocessing import (
    is_mostly_blank,
    preprocess_for_ocr,
)

logger = logging.getLogger(__name__)

# ── Tesseract availability ──

# Probe once at import time. The flag ends up True only when the pytesseract
# package imports and the version query succeeds; any failure is logged and
# leaves OCR disabled instead of breaking the import of this module.
try:
    import pytesseract

    pytesseract.get_tesseract_version()
except Exception:
    _TESSERACT_AVAILABLE = False
    logger.warning("pytesseract / Tesseract not available; OCR will be disabled")
else:
    _TESSERACT_AVAILABLE = True


def tesseract_available() -> bool:
    """Return the current value of the module-level ``_TESSERACT_AVAILABLE`` flag."""
    return _TESSERACT_AVAILABLE


def configure_tesseract(cmd: str) -> None:
    """Override the tesseract binary path at runtime.

    Points pytesseract at ``cmd`` and re-runs the version query, updating
    the module-level ``_TESSERACT_AVAILABLE`` flag with the outcome.

    Args:
        cmd: Path of the tesseract executable. A falsy value (empty string,
            ``None``) is a no-op and leaves the flag untouched.

    Returns:
        None. The result is observable through ``_TESSERACT_AVAILABLE``.
    """
    global _TESSERACT_AVAILABLE
    if not cmd:
        return

    try:
        # The import lives inside the guarded region so that a missing
        # pytesseract package disables OCR instead of raising ImportError.
        import pytesseract as _pt

        _pt.pytesseract.tesseract_cmd = cmd
        _pt.get_tesseract_version()
    except Exception:
        _TESSERACT_AVAILABLE = False
    else:
        _TESSERACT_AVAILABLE = True


# ── Rasterize a PDF page to PIL Image ──


def rasterize_page(pdf_path: str | Path, page_num: int, dpi: int = 300) -> Image.Image:
    """Open the PDF, render one page, and hand it back as an RGB PIL image.

    Args:
        pdf_path: Location of the PDF file.
        page_num: Index used to look the page up in the opened document.
        dpi: Target resolution; the render is scaled by ``dpi / 72``.

    Returns:
        An ``Image.Image`` in "RGB" mode built from the rendered pixmap.
    """
    with fitz.open(str(pdf_path)) as pdf:
        scale = dpi / 72.0
        pixmap = pdf[page_num].get_pixmap(
            matrix=fitz.Matrix(scale, scale),
            alpha=False,  # no alpha channel, so the samples are plain RGB
        )
        size = (pixmap.width, pixmap.height)
        return Image.frombytes("RGB", size, pixmap.samples)


# ── OCR a single page ──

# A line that opens with a structural keyword followed by whitespace and a
# number, e.g. "Chapter 3" or "SECTION 12" (matched case-insensitively).
_HEADING_PATTERN = re.compile(r"^(?:chapter|section|part|article)\s+\d+", flags=re.IGNORECASE)

# A line that starts (after optional whitespace) with a bullet glyph, a
# numbered marker ("1." / "2)") or a single-letter marker ("a." / "B)").
# Numbered and lettered markers must be followed by whitespace.
_LIST_PATTERN = re.compile(
    r"^\s*(?:[•\-–—○■□►▸●]|\d+[.)]\s|[a-z][.)]\s)",
    flags=re.IGNORECASE,
)


# Exactly one leading list marker: a bullet glyph (with optional trailing
# whitespace) or a number followed by "." / ")" and whitespace. Lettered
# markers ("a)", "b.") are not covered, so they stay in the item text.
_LIST_MARKER_PREFIX = re.compile(r"^\s*(?:[•\-–—○■□►▸●]\s*|\d+[.)]\s+)")


def _classify_ocr_lines(lines: list[str]) -> list[ContentBlock]:
    """Heuristic classification of OCR text lines into content blocks.

    Each line is stripped and then handled by the first rule that applies:

    1. Blank line: ends the paragraph currently being accumulated.
    2. Heading: more than 3 and fewer than 80 characters with at least one
       cased character and no lowercase ones, or a match of
       ``_HEADING_PATTERN``. Emitted as an H2 heading.
    3. List item: a match of ``_LIST_PATTERN``. Emitted as its own LIST
       block holding a single item, with one leading bullet or number
       marker removed.
    4. Anything else is appended to the current paragraph; its lines are
       joined with single spaces when the paragraph is emitted.

    Args:
        lines: Raw text lines, in reading order.

    Returns:
        Content blocks in input order, all tagged ``source="ocr"``.
    """
    blocks: list[ContentBlock] = []
    para_lines: list[str] = []

    def flush() -> None:
        """Emit the buffered paragraph lines (if any) as one PARAGRAPH block."""
        if not para_lines:
            return
        text = " ".join(para_lines).strip()
        if text:
            blocks.append(ContentBlock(
                block_type=BlockType.PARAGRAPH,
                text=text,
                source="ocr",
            ))
        para_lines.clear()

    for line in lines:
        stripped = line.strip()
        if not stripped:
            flush()
            continue

        # str.isupper() needs at least one cased character, so lines made
        # only of digits or punctuation ("2023-2024", "-----") are no longer
        # mistaken for all-caps headings.
        is_all_caps = 3 < len(stripped) < 80 and stripped.isupper()
        if is_all_caps or _HEADING_PATTERN.match(stripped):
            flush()
            blocks.append(ContentBlock(
                block_type=BlockType.HEADING,
                text=stripped,
                heading_level=HeadingLevel.H2,
                source="ocr",
            ))
            continue

        if _LIST_PATTERN.match(stripped):
            flush()
            # Remove only the one marker at the start of the line. Stripping
            # bullet and number markers in sequence would corrupt items such
            # as "• 1.5 GPA" into "5 GPA". Fall back to the whole line when
            # nothing is left (the line was just a marker).
            clean = _LIST_MARKER_PREFIX.sub("", stripped, count=1) or stripped
            blocks.append(ContentBlock(
                block_type=BlockType.LIST,
                list_items=[ListItem(text=clean)],
                source="ocr",
            ))
            continue

        para_lines.append(stripped)

    flush()
    return blocks


def ocr_page(
    pdf_path: str | Path,
    page_num: int,
    dpi: int = 300,
    lang: str = "eng",
) -> PageResult:
    """Run OCR on a single PDF page and return structured blocks.

    Args:
        pdf_path: Location of the PDF file.
        page_num: Index used to look the page up in the document; the
            result reports ``page_num + 1`` as its page number.
        dpi: Rasterization resolution forwarded to ``rasterize_page``.
        lang: Language argument forwarded to Tesseract.

    Returns:
        A ``PageResult`` with ``is_scanned=True``:

        * Tesseract unavailable: a single placeholder paragraph and a
          confidence of 0.0. The PDF is not opened.
        * Page judged blank by ``is_mostly_blank``: no blocks, empty text,
          confidence 1.0.
        * Otherwise: the classified blocks (each carrying the page-average
          confidence), any table blocks, the plain text, the page size and
          the average confidence rounded to three decimals.
    """
    if not _TESSERACT_AVAILABLE:
        return PageResult(
            page_number=page_num + 1,
            blocks=[ContentBlock(
                block_type=BlockType.PARAGRAPH,
                text="[OCR unavailable — Tesseract not installed]",
                source="ocr",
                confidence=0.0,
            )],
            plain_text="[OCR unavailable]",
            is_scanned=True,
            ocr_confidence=0.0,
        )

    import pytesseract

    image = rasterize_page(pdf_path, page_num, dpi)

    if is_mostly_blank(image):
        return PageResult(
            page_number=page_num + 1,
            is_scanned=True,
            ocr_confidence=1.0,
            plain_text="",
            blocks=[],
        )

    # Preprocess for OCR
    processed = preprocess_for_ocr(image)

    # Run Tesseract with full data for confidence scores
    ocr_data = pytesseract.image_to_data(
        processed, lang=lang, output_type=pytesseract.Output.DICT,
    )

    # Average the confidences, skipping the negative (-1) placeholder
    # entries. Values are coerced with float() because some
    # pytesseract / Tesseract combinations return confidences as numeric
    # strings such as "96.06"; an isinstance(int, float) filter would drop
    # those and report a confidence of 0.0.
    confs: list[float] = []
    for raw_conf in ocr_data.get("conf", []):
        try:
            value = float(raw_conf)
        except (TypeError, ValueError):
            continue
        if value >= 0:
            confs.append(value)
    avg_conf = sum(confs) / len(confs) / 100.0 if confs else 0.0

    # Also get plain text
    plain_text = pytesseract.image_to_string(processed, lang=lang).strip()
    blocks = _classify_ocr_lines(plain_text.split("\n"))

    # Tesseract confidences are averaged per page here, so every text block
    # receives the same page-level value.
    for block in blocks:
        block.confidence = avg_conf

    # Attempt table detection via Tesseract TSV data
    blocks.extend(_detect_tables_from_ocr(ocr_data))

    # Page dimensions come from the PDF itself, not from the raster image.
    with fitz.open(str(pdf_path)) as doc:
        rect = doc[page_num].rect

    return PageResult(
        page_number=page_num + 1,
        width=rect.width,
        height=rect.height,
        blocks=blocks,
        plain_text=plain_text,
        is_scanned=True,
        ocr_confidence=round(avg_conf, 3),
    )


def _group_ocr_words_by_line(ocr_data: dict) -> dict[tuple[int, int], list[dict]]:
    """Bucket non-empty OCR words by their ``(block_num, line_num)`` pair."""
    columns = (
        ocr_data.get("text") or [],
        ocr_data.get("block_num") or [],
        ocr_data.get("line_num") or [],
        ocr_data.get("left") or [],
        ocr_data.get("top") or [],
        ocr_data.get("width") or [],
        ocr_data.get("height") or [],
    )

    line_groups: dict[tuple[int, int], list[dict]] = {}
    # zip() stops at the shortest column, so a missing or ragged column
    # yields fewer words instead of raising IndexError.
    for text, block_num, line_num, left, top, width, height in zip(*columns):
        word = (text or "").strip()
        if not word:
            continue
        line_groups.setdefault((block_num, line_num), []).append({
            "text": word,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
        })
    return line_groups


def _detect_tables_from_ocr(ocr_data: dict) -> list[ContentBlock]:
    """Placeholder for table detection from OCR bounding-box alignment.

    The intended approach is to group words into rows by line and into
    columns by x-gaps. Only the row grouping exists so far; no alignment
    heuristic is implemented, so this function currently always returns an
    empty list.

    Args:
        ocr_data: Column-oriented dict; the keys read are ``text``,
            ``block_num``, ``line_num``, ``left``, ``top``, ``width`` and
            ``height``. Missing keys are treated as empty columns.

    Returns:
        A list of table content blocks, at present always empty.
    """
    blocks: list[ContentBlock] = []

    tops = ocr_data.get("top", [])
    if not tops or len(tops) < 4:
        # Fewer than four OCR entries: nothing to examine.
        return blocks

    # Words grouped per (block, line); this is the input a tab-stop
    # alignment check would consume.
    _line_groups = _group_ocr_words_by_line(ocr_data)

    # TODO: look for blocks where multiple lines have consistent tab-stop
    # alignment (indicates tabular layout). For production, consider using
    # opencv line-detection on the original image.

    return blocks