Spaces:

build-small-hackathon
/

FinSightAI

Sleeping

File size: 4,673 Bytes

7248d39

"""PyMuPDF-based PDF parsing utilities."""

from __future__ import annotations

from dataclasses import dataclass
from typing import List, Tuple

import fitz
from PIL import Image

# Pages whose stripped embedded text is shorter than this many characters are
# flagged as sparse.
SPARSE_TEXT_THRESHOLD = 100
# Largest difference in top (y0) coordinate, measured against the first block
# of a line, for another block to still be placed on that same line.
_LINE_Y_TOLERANCE = 4.0
# Horizontal distance represented by one space character when gaps between
# text blocks are converted to padding (presumably PDF points, per the name).
_SPACE_POINTS = 3.5


@dataclass
class PDFPage:
    """Extracted text of one PDF page together with its rendered image."""

    # 1-based page number (``extract_pdf_pages`` stores ``page_num + 1``).
    page_number: int
    # Text returned by ``page.get_text("text")`` for the page, not stripped.
    embedded_text: str
    # The page rasterised via ``page.get_pixmap`` and wrapped as an RGB image.
    image: Image.Image
    # True when the stripped embedded text has fewer than
    # ``SPARSE_TEXT_THRESHOLD`` characters.
    is_sparse: bool


def extract_pdf_pages(file_bytes: bytes, dpi_scale: float = 2.0) -> List[PDFPage]:
    """Collect the embedded text and a rendered image for every PDF page.

    Args:
        file_bytes: Raw bytes of the PDF document.
        dpi_scale: Zoom factor applied to both axes when rasterising a page.

    Returns:
        One ``PDFPage`` per page, numbered from 1, in document order.
    """
    document = fitz.open(stream=file_bytes, filetype="pdf")
    collected = []
    try:
        for number, pdf_page in enumerate(document, start=1):
            text = pdf_page.get_text("text")
            zoom = fitz.Matrix(dpi_scale, dpi_scale)
            pixmap = pdf_page.get_pixmap(matrix=zoom)
            rendered = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )
            # Sparseness is judged on the stripped text, but the stored text
            # keeps its original whitespace.
            sparse = len(text.strip()) < SPARSE_TEXT_THRESHOLD
            collected.append(
                PDFPage(
                    page_number=number,
                    embedded_text=text,
                    image=rendered,
                    is_sparse=sparse,
                )
            )
    finally:
        # Release the document even if extraction of some page fails.
        document.close()
    return collected


def _group_blocks_into_lines(
    blocks: List[Tuple[float, float, float, str]],
) -> List[List[Tuple[float, float, str]]]:
    """Group positioned text blocks into visual lines, top to bottom.

    Args:
        blocks: ``(y0, x0, x1, text)`` tuples. The list is not modified.

    Returns:
        One list per line, each holding ``(x0, x1, text)`` tuples in the order
        produced by sorting on ``y0`` (rounded to one decimal) and then ``x0``.
    """
    # Sort a copy so the caller's list is not reordered as a side effect.
    ordered = sorted(blocks, key=lambda item: (round(item[0], 1), item[1]))
    lines: List[List[Tuple[float, float, str]]] = []
    current_y: float | None = None
    current_line: List[Tuple[float, float, str]] = []

    for y0, x0, x1, text in ordered:
        # A line is anchored at the y0 of its first block; a block further
        # than the tolerance away from that anchor opens a new line.
        if current_y is None or abs(y0 - current_y) > _LINE_Y_TOLERANCE:
            if current_line:
                lines.append(current_line)
            current_line = [(x0, x1, text)]
            current_y = y0
        else:
            current_line.append((x0, x1, text))

    # Flush the line still being built when the input ran out.
    if current_line:
        lines.append(current_line)
    return lines


def extract_page_spatial_text(page: fitz.Page) -> str:
    """Reconstruct a page's text, padding with spaces to mimic column layout.

    Text blocks reported by ``page.get_text("blocks")`` are grouped into lines
    and joined left to right, with the horizontal gaps between them turned
    into runs of spaces. When the page yields no usable text block, the sorted
    plain-text extraction is returned instead.
    """
    positioned: List[Tuple[float, float, float, str]] = []
    for entry in page.get_text("blocks"):
        # Only entries whose seventh field is 0 are used; others are skipped.
        if entry[6] != 0:
            continue
        left, top, right, _bottom, raw, *_ = entry
        flattened = raw.replace("\n", " ").strip()
        if flattened:
            positioned.append((top, left, right, flattened))

    if not positioned:
        return page.get_text("text", sort=True).strip()

    rendered: List[str] = []
    for row in _group_blocks_into_lines(positioned):
        pieces: List[str] = []
        previous_right = 0.0
        ordered_row = sorted(row, key=lambda segment: segment[0])
        for index, (left, right, chunk) in enumerate(ordered_row):
            if index == 0:
                # Indent the first block according to its left edge.
                padding = max(0, int(left / _SPACE_POINTS))
            else:
                # Later blocks always get at least one separating space.
                padding = max(1, int((left - previous_right) / _SPACE_POINTS))
            pieces.append(" " * padding + chunk)
            previous_right = right
        rendered.append("".join(pieces).rstrip())

    return "\n".join(rendered).strip()


def extract_pdf_spatial_pages(file_bytes: bytes) -> List[Tuple[int, str, bool]]:
    """Return ``(page_num, text, is_sparse)`` for each page of the PDF.

    Sparse pages carry their stripped embedded text unchanged; all other
    pages carry the spatially rebuilt text. Page numbers start at 1.
    """
    document = fitz.open(stream=file_bytes, filetype="pdf")
    results: List[Tuple[int, str, bool]] = []
    try:
        for number, pdf_page in enumerate(document, start=1):
            plain = pdf_page.get_text("text").strip()
            sparse = len(plain) < SPARSE_TEXT_THRESHOLD
            # The spatial rebuild is only attempted for non-sparse pages.
            text = plain if sparse else extract_page_spatial_text(pdf_page)
            results.append((number, text, sparse))
    finally:
        document.close()
    return results


def render_page_image(
    file_bytes: bytes, page_num: int, dpi_scale: float = 2.0
) -> Image.Image:
    """Render a single PDF page — used only when chart OCR is needed.

    Args:
        file_bytes: Raw bytes of the PDF document.
        page_num: 1-based number of the page to render.
        dpi_scale: Zoom factor applied to both axes when rasterising.

    Returns:
        The rendered page as an RGB PIL image.

    Raises:
        IndexError: If ``page_num`` is less than 1. Page numbers beyond the
            end of the document are left to PyMuPDF's page lookup to reject.
    """
    # Reject non-positive numbers up front: ``page_num - 1`` would become a
    # negative index, which PyMuPDF resolves from the end of the document,
    # silently rendering the wrong page.
    if page_num < 1:
        raise IndexError(f"page_num must be >= 1 (1-based), got {page_num}")

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        page = doc[page_num - 1]
        mat = fitz.Matrix(dpi_scale, dpi_scale)
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Always release the document, including when the lookup fails.
        doc.close()


def render_page_png_base64(file_bytes: bytes, page_num: int = 1, dpi_scale: float = 2.0) -> str:
    """Render one PDF page to PNG and return it as a base64 ASCII string.

    Args:
        file_bytes: Raw bytes of the PDF document.
        page_num: 1-based number of the page to render (defaults to the first).
        dpi_scale: Zoom factor applied to both axes when rasterising.

    Returns:
        The base64 encoding of the page's PNG bytes, decoded as ASCII text.

    Raises:
        IndexError: If ``page_num`` is less than 1. Page numbers beyond the
            end of the document are left to PyMuPDF's page lookup to reject.
    """
    import base64

    # Reject non-positive numbers up front: ``page_num - 1`` would become a
    # negative index, which PyMuPDF resolves from the end of the document,
    # silently rendering the wrong page.
    if page_num < 1:
        raise IndexError(f"page_num must be >= 1 (1-based), got {page_num}")

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        page = doc[page_num - 1]
        mat = fitz.Matrix(dpi_scale, dpi_scale)
        pix = page.get_pixmap(matrix=mat)
        return base64.b64encode(pix.tobytes("png")).decode("ascii")
    finally:
        # Always release the document, including when the lookup fails.
        doc.close()