handbook-ocr-engine / app /services /text_extractor.py
internationalscholarsprogram's picture
Initial deploy: ISP Handbook OCR Engine
b12284c verified
"""Text extraction from digital (native-text) PDFs.
Uses PyMuPDF (fitz) for fast native text extraction and pdfplumber
for table detection on text-based pages.
"""
from __future__ import annotations
import logging
from pathlib import Path
import fitz # PyMuPDF
import pdfplumber
from app.schemas.extraction import (
BlockType,
ContentBlock,
DocumentMetadata,
HeadingLevel,
ListItem,
PageResult,
TableBlock,
TableCell,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# ── Heuristics ──
# A span whose font size is at least this value is classified as a heading.
_HEADING_MIN_SIZE = 13.0 # font size threshold for headings
# Characters that mark a bulleted list item when they are the first
# non-blank character of a line.
_LIST_BULLETS = {"•", "–", "-", "—", "○", "■", "□", "►", "▸", "●"}
def _is_heading(span: dict) -> bool:
    """Heuristically decide whether a text span looks like a heading.

    A span qualifies when its font size reaches ``_HEADING_MIN_SIZE``, or
    when it is bold and its size is at least 11.5. A missing "size" is
    treated as 12 and missing "flags" as 0.
    """
    font_size = span.get("size", 12)
    style_flags = span.get("flags", 0)
    bold = (style_flags & (1 << 4)) != 0  # bit 4 = bold
    if font_size >= _HEADING_MIN_SIZE:
        return True
    return bold and font_size >= 11.5
def _heading_level(size: float) -> HeadingLevel:
    """Map a font size to a heading level; larger fonts rank higher.

    Sizes of 22 and up map to H1, 18 and up to H2, 15 and up to H3 and
    13 and up to H4. Anything smaller falls back to H5.
    """
    # Ordered from largest to smallest so the first match wins.
    thresholds = (
        (22, HeadingLevel.H1),
        (18, HeadingLevel.H2),
        (15, HeadingLevel.H3),
        (13, HeadingLevel.H4),
    )
    for minimum, level in thresholds:
        if size >= minimum:
            return level
    return HeadingLevel.H5
def _is_list_line(line: str) -> bool:
stripped = line.strip()
if not stripped:
return False
# Bullet or numbered list
if stripped[0] in _LIST_BULLETS:
return True
# "1." or "a)" style
if len(stripped) >= 2 and stripped[0].isalnum() and stripped[1] in ".)" :
return True
return False
def _strip_bullet(line: str) -> str:
stripped = line.strip()
if stripped and stripped[0] in _LIST_BULLETS:
return stripped[1:].strip()
# "1." style
if len(stripped) >= 2 and stripped[0].isalnum() and stripped[1] in ".)":
return stripped[2:].strip()
return stripped
# ── Page text check ──
def page_has_native_text(
    pdf_path: str | Path,
    page_num: int,
    *,
    min_chars: int = 30,
) -> bool:
    """Return True if the page has enough native text to skip OCR.

    Args:
        pdf_path: Path of the PDF to inspect.
        page_num: Index of the page within the document.
        min_chars: The page's stripped text must be longer than this many
            characters to count as native text. The default of 30 is an
            arbitrary minimum.

    Returns:
        True when the page's stripped text is longer than ``min_chars``.
        False otherwise, including when ``page_num`` is at or beyond the
        number of pages in the document.
    """
    with fitz.open(str(pdf_path)) as doc:
        if page_num >= len(doc):
            return False
        text = doc[page_num].get_text("text").strip()
    return len(text) > min_chars
def document_has_native_text(
    pdf_path: str | Path,
    *,
    min_chars: int = 30,
) -> bool:
    """Quick check: does ANY page have substantial native text?

    Args:
        pdf_path: Path of the PDF to inspect.
        min_chars: A page's stripped text must be longer than this many
            characters to count as substantial. Defaults to 30.

    Returns:
        True as soon as one page exceeds ``min_chars``; False if none does.
    """
    with fitz.open(str(pdf_path)) as doc:
        # any() stops at the first qualifying page, so later pages are not read.
        return any(
            len(page.get_text("text").strip()) > min_chars for page in doc
        )
# ── Metadata ──
def extract_metadata(pdf_path: str | Path) -> DocumentMetadata:
    """Collect document-level metadata for the PDF at *pdf_path*.

    String fields come from the PDF's metadata dictionary; a missing or
    falsy entry becomes an empty string. The page count comes from the
    opened document, and the file name and size from the filesystem path.
    """
    path = Path(pdf_path)
    with fitz.open(str(path)) as doc:
        info = doc.metadata or {}

        def field(key: str) -> str:
            # Missing keys and falsy values both collapse to "".
            return info.get(key, "") or ""

        return DocumentMetadata(
            title=field("title"),
            author=field("author"),
            subject=field("subject"),
            creator=field("creator"),
            producer=field("producer"),
            page_count=len(doc),
            file_name=path.name,
            file_size_bytes=path.stat().st_size,
            mime_type="application/pdf",
            creation_date=field("creationDate"),
            modification_date=field("modDate"),
        )
# ── Structured text extraction (no OCR) ──
def extract_text_page(pdf_path: str | Path, page_num: int) -> PageResult:
    """Extract structured blocks from a native-text PDF page.

    Each text line is classified as a heading, a list item or paragraph
    text. Consecutive paragraph lines are merged into one paragraph block,
    and every list line becomes its own single-item list block. Tables
    detected by ``_extract_tables_plumber`` are appended after the text
    blocks.

    Args:
        pdf_path: Path of the PDF to read.
        page_num: Index of the page within the document. The returned
            ``page_number`` is ``page_num + 1``.

    Returns:
        A ``PageResult`` holding the page size, the extracted blocks and a
        plain-text rendering built from heading and paragraph blocks only.
    """
    blocks: list[ContentBlock] = []
    current_paragraph_lines: list[str] = []

    def flush_paragraph() -> None:
        """Emit the buffered paragraph lines as one block and reset the buffer."""
        if not current_paragraph_lines:
            return
        text = " ".join(current_paragraph_lines).strip()
        current_paragraph_lines.clear()
        if not text:
            return
        # Buffered lines are joined with spaces, so this split only yields
        # several entries when a line's own text contains a newline.
        lines = [entry for entry in text.split("\n") if entry.strip()]
        if all(_is_list_line(entry) for entry in lines):
            blocks.append(ContentBlock(
                block_type=BlockType.LIST,
                list_items=[ListItem(text=_strip_bullet(entry)) for entry in lines],
                source="text",
            ))
        else:
            blocks.append(ContentBlock(
                block_type=BlockType.PARAGRAPH,
                text=text,
                source="text",
            ))

    with fitz.open(str(pdf_path)) as doc:
        page = doc[page_num]
        # Capture the page size here so the file is not reopened later just
        # to read it again.
        page_width = page.rect.width
        page_height = page.rect.height
        text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

    for block_dict in text_dict.get("blocks", []):
        if block_dict.get("type") != 0:  # 0 = text block
            continue
        for line_dict in block_dict.get("lines", []):
            spans = line_dict.get("spans", [])
            if not spans:
                continue
            line_text = "".join(s.get("text", "") for s in spans).strip()
            if not line_text:
                # A whitespace-only line ends the current paragraph.
                flush_paragraph()
                continue

            # Only the first span of the line decides heading status.
            first_span = spans[0]
            if _is_heading(first_span):
                flush_paragraph()
                blocks.append(ContentBlock(
                    block_type=BlockType.HEADING,
                    text=line_text,
                    heading_level=_heading_level(first_span.get("size", 12)),
                    source="text",
                ))
            elif _is_list_line(line_text):
                flush_paragraph()
                blocks.append(ContentBlock(
                    block_type=BlockType.LIST,
                    list_items=[ListItem(text=_strip_bullet(line_text))],
                    source="text",
                ))
            else:
                current_paragraph_lines.append(line_text)
    flush_paragraph()

    # Table detection via pdfplumber
    _extract_tables_plumber(pdf_path, page_num, blocks)

    plain = "\n".join(
        b.text for b in blocks
        if b.block_type in (BlockType.HEADING, BlockType.PARAGRAPH)
    )
    return PageResult(
        page_number=page_num + 1,  # 1-indexed for humans
        width=page_width,
        height=page_height,
        blocks=blocks,
        plain_text=plain,
        is_scanned=False,
        ocr_confidence=1.0,
    )
def _extract_tables_plumber(
    pdf_path: str | Path,
    page_num: int,
    blocks: list[ContentBlock],
) -> None:
    """Detect tables with pdfplumber and append TableBlock entries.

    Best-effort: any exception raised during detection is logged as a
    warning and swallowed, so this function never raises to the caller.

    Args:
        pdf_path: Path of the PDF to read.
        page_num: Index of the page within the document. Nothing is
            appended when it is at or beyond the number of pages.
        blocks: List that receives one TABLE ``ContentBlock`` per detected
            non-empty table. It is mutated in place.
    """
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            if page_num >= len(pdf.pages):
                return
            tables = pdf.pages[page_num].extract_tables()
            for raw_table in tables:
                if not raw_table:
                    continue
                # Normalise missing rows once so the column count and the
                # cell loop agree; a None row would otherwise make len()
                # raise and discard the remaining tables on this page.
                rows = [row or [] for row in raw_table]
                cells: list[TableCell] = [
                    TableCell(
                        text=(val or "").strip(),
                        row=ri,
                        col=ci,
                        is_header=(ri == 0),  # the first row is the header
                    )
                    for ri, row in enumerate(rows)
                    for ci, val in enumerate(row)
                ]
                tb = TableBlock(
                    rows=len(rows),
                    cols=max((len(row) for row in rows), default=0),
                    cells=cells,
                )
                blocks.append(ContentBlock(
                    block_type=BlockType.TABLE,
                    table=tb,
                    source="text",
                ))
    except Exception:
        logger.warning("pdfplumber table extraction failed on page %d", page_num, exc_info=True)