Spaces:

internationalscholarsprogram
/

handbook-ocr-engine

Sleeping

App Files Files Community

handbook-ocr-engine / app /services /text_extractor.py

internationalscholarsprogram

Initial deploy: ISP Handbook OCR Engine

b12284c verified about 1 month ago

raw

history blame contribute delete

8.39 kB

	"""Text extraction from digital (native-text) PDFs.

	Uses PyMuPDF (fitz) for fast native text extraction and pdfplumber
	for table detection on text-based pages.
	"""

	from __future__ import annotations

	import logging
	import re
	from pathlib import Path

	import fitz # PyMuPDF
	import pdfplumber

	from app.schemas.extraction import (
	BlockType,
	ContentBlock,
	DocumentMetadata,
	HeadingLevel,
	ListItem,
	PageResult,
	TableBlock,
	TableCell,
	)

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# ── Heuristics ──

	# Spans whose font size is at or above this value are classified as headings
	# by _is_heading and map to at least H4 in _heading_level.
	_HEADING_MIN_SIZE = 13.0 # font size threshold for headings
	# A line whose first non-blank character is one of these is treated as a
	# bullet-list item; the character is removed by _strip_bullet.
	_LIST_BULLETS = {"•", "–", "-", "—", "○", "■", "□", "►", "▸", "●"}


	def _is_heading(span: dict) -> bool:
	    """Heuristically decide whether a text span looks like a heading.

	    A span qualifies when its font size reaches ``_HEADING_MIN_SIZE``, or
	    when it is bold and its size is at least 11.5.

	    Args:
	        span: Span dictionary; only the ``"size"`` and ``"flags"`` keys are
	            read. A missing size defaults to 12, missing flags to 0.

	    Returns:
	        True if the span should be treated as heading text.
	    """
	    font_size = span.get("size", 12)
	    style_flags = span.get("flags", 0)
	    # Bit 4 (value 16) of the span flags marks bold text.
	    bold = (style_flags & (1 << 4)) != 0
	    if font_size >= _HEADING_MIN_SIZE:
	        return True
	    return bold and font_size >= 11.5


	def _heading_level(size: float) -> HeadingLevel:
	    """Map a font size to a heading level; larger text ranks higher.

	    Thresholds (inclusive lower bounds): 22 -> H1, 18 -> H2, 15 -> H3,
	    13 -> H4. Anything smaller is H5.

	    Args:
	        size: Font size of the heading span.

	    Returns:
	        The ``HeadingLevel`` for the first threshold that ``size`` reaches.
	    """
	    tiers = (
	        (22, HeadingLevel.H1),
	        (18, HeadingLevel.H2),
	        (15, HeadingLevel.H3),
	        (13, HeadingLevel.H4),
	    )
	    # Tiers are ordered from largest to smallest, so the first hit wins.
	    for floor, level in tiers:
	        if size >= floor:
	            return level
	    return HeadingLevel.H5


	# Ordered-list marker at the start of a (stripped) line. Two accepted shapes:
	#   * 1-3 digits followed by "." or ")", as long as no digit comes next
	#     ("1.", "10)", "1.Intro" match; "3.5 GPA" and "1.2 Overview" do not);
	#   * a single letter followed by ")", or by "." when whitespace or the end
	#     of the line comes next ("a)", "b. item" match; "e.g." and "U.S." do not).
	_ORDERED_MARKER_RE = re.compile(
	    r"(?:\d{1,3}[.)](?!\d)|[^\W\d_](?:\)|\.(?=\s|$)))"
	)


	def _is_list_line(line: str) -> bool:
	    """Return True if ``line`` looks like a bullet or ordered-list item.

	    A line is a list item when, after stripping surrounding whitespace, it
	    starts with a character from ``_LIST_BULLETS`` or with an ordered marker
	    such as ``1.``, ``10)``, ``a)`` or ``b.`` (see ``_ORDERED_MARKER_RE``).
	    Decimal numbers and dotted abbreviations at the start of a line are
	    deliberately not treated as markers.

	    Args:
	        line: Raw text of one line.

	    Returns:
	        True for list items, False otherwise (including blank lines).
	    """
	    stripped = line.strip()
	    if not stripped:
	        return False
	    if stripped[0] in _LIST_BULLETS:
	        return True
	    return _ORDERED_MARKER_RE.match(stripped) is not None


	def _strip_bullet(line: str) -> str:
	    """Remove a leading bullet or ordered-list marker from ``line``.

	    Uses the same marker rules as ``_is_list_line`` so that every line
	    recognised as a list item has its marker removed.

	    Args:
	        line: Raw text of one line.

	    Returns:
	        The text after the marker with surrounding whitespace stripped, or
	        the stripped line unchanged when it carries no marker.
	    """
	    stripped = line.strip()
	    if stripped and stripped[0] in _LIST_BULLETS:
	        return stripped[1:].strip()
	    marker = _ORDERED_MARKER_RE.match(stripped)
	    if marker:
	        # Drop the whole marker, which may be longer than two characters.
	        return stripped[marker.end():].strip()
	    return stripped


	# ── Page text check ──


	def page_has_native_text(
	    pdf_path: str | Path,
	    page_num: int,
	    *,
	    min_chars: int = 30,
	) -> bool:
	    """Return True if the page has enough native text to skip OCR.

	    Args:
	        pdf_path: Location of the PDF file.
	        page_num: Zero-based page index.
	        min_chars: The page qualifies only when its stripped text is longer
	            than this many characters. The default of 30 is the threshold
	            this function has always used.

	    Returns:
	        False when ``page_num`` is at or beyond the page count, otherwise
	        whether the extracted text exceeds ``min_chars`` characters.
	    """
	    with fitz.open(str(pdf_path)) as doc:
	        if page_num >= len(doc):
	            return False
	        text = doc[page_num].get_text("text").strip()
	    return len(text) > min_chars


	def document_has_native_text(
	    pdf_path: str | Path,
	    *,
	    min_chars: int = 30,
	) -> bool:
	    """Quick check: does ANY page have substantial native text?

	    Pages are examined in order and the scan stops at the first page that
	    qualifies.

	    Args:
	        pdf_path: Location of the PDF file.
	        min_chars: A page qualifies only when its stripped text is longer
	            than this many characters. The default of 30 is the threshold
	            this function has always used.

	    Returns:
	        True if at least one page exceeds ``min_chars`` characters of text.
	    """
	    with fitz.open(str(pdf_path)) as doc:
	        return any(
	            len(page.get_text("text").strip()) > min_chars for page in doc
	        )


	# ── Metadata ──


	def extract_metadata(pdf_path: str | Path) -> DocumentMetadata:
	    """Collect document-level metadata for a PDF file.

	    Combines the PDF's embedded metadata dictionary with the page count and
	    basic file facts (name and size on disk).

	    Args:
	        pdf_path: Location of the PDF file.

	    Returns:
	        A ``DocumentMetadata`` whose text fields are empty strings wherever
	        the PDF supplies no value.
	    """
	    pdf_file = Path(pdf_path)
	    with fitz.open(str(pdf_file)) as doc:
	        info = doc.metadata or {}

	        def field(key: str) -> str:
	            # Absent keys and None/empty values all normalise to "".
	            return info.get(key, "") or ""

	        return DocumentMetadata(
	            title=field("title"),
	            author=field("author"),
	            subject=field("subject"),
	            creator=field("creator"),
	            producer=field("producer"),
	            page_count=len(doc),
	            file_name=pdf_file.name,
	            file_size_bytes=pdf_file.stat().st_size,
	            mime_type="application/pdf",
	            creation_date=field("creationDate"),
	            modification_date=field("modDate"),
	        )


	# ── Structured text extraction (no OCR) ──


	def extract_text_page(pdf_path: str | Path, page_num: int) -> PageResult:
	    """Extract structured blocks from a native-text PDF page.

	    The page's text lines are visited in the order PyMuPDF reports them and
	    each one is classified:

	    * a line whose first span passes ``_is_heading`` becomes a HEADING block;
	    * a line passing ``_is_list_line`` becomes its own single-item LIST block;
	    * every other line is buffered, and the buffer is joined with spaces
	      into one PARAGRAPH block when a heading, list item, blank line or the
	      end of the page is reached.

	    Tables found by pdfplumber are appended after the text blocks.

	    Args:
	        pdf_path: Location of the PDF file.
	        page_num: Zero-based page index. It is not range-checked here; an
	            invalid index is left to PyMuPDF to reject.

	    Returns:
	        A ``PageResult`` with a 1-based page number, the page dimensions, the
	        extracted blocks, and ``plain_text`` built from heading and paragraph
	        blocks only (list and table content is not included).
	    """
	    blocks: list[ContentBlock] = []
	    pending_lines: list[str] = []

	    def flush_paragraph() -> None:
	        """Emit buffered lines as one PARAGRAPH block and reset the buffer."""
	        text = " ".join(pending_lines).strip()
	        pending_lines.clear()
	        if text:
	            blocks.append(ContentBlock(
	                block_type=BlockType.PARAGRAPH,
	                text=text,
	                source="text",
	            ))

	    # Read everything needed from the document in a single open; the text
	    # dictionary and the page dimensions are used after the file is closed.
	    with fitz.open(str(pdf_path)) as doc:
	        page = doc[page_num]
	        page_width = page.rect.width
	        page_height = page.rect.height
	        text_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

	    for block_dict in text_dict.get("blocks", []):
	        if block_dict.get("type") != 0:  # 0 = text block
	            continue
	        for line_dict in block_dict.get("lines", []):
	            spans = line_dict.get("spans", [])
	            if not spans:
	                continue

	            line_text = "".join(s.get("text", "") for s in spans).strip()
	            if not line_text:
	                # A whitespace-only line ends the current paragraph.
	                flush_paragraph()
	                continue

	            # Only the first span's font decides whether the line is a heading.
	            first_span = spans[0]
	            if _is_heading(first_span):
	                flush_paragraph()
	                blocks.append(ContentBlock(
	                    block_type=BlockType.HEADING,
	                    text=line_text,
	                    heading_level=_heading_level(first_span.get("size", 12)),
	                    source="text",
	                ))
	            elif _is_list_line(line_text):
	                flush_paragraph()
	                blocks.append(ContentBlock(
	                    block_type=BlockType.LIST,
	                    list_items=[ListItem(text=_strip_bullet(line_text))],
	                    source="text",
	                ))
	            else:
	                pending_lines.append(line_text)

	    flush_paragraph()

	    # Table detection via pdfplumber
	    _extract_tables_plumber(pdf_path, page_num, blocks)

	    plain = "\n".join(
	        b.text for b in blocks
	        if b.block_type in (BlockType.HEADING, BlockType.PARAGRAPH)
	    )

	    return PageResult(
	        page_number=page_num + 1,  # 1-indexed for humans
	        width=page_width,
	        height=page_height,
	        blocks=blocks,
	        plain_text=plain,
	        is_scanned=False,
	        ocr_confidence=1.0,
	    )


	def _extract_tables_plumber(
	    pdf_path: str | Path,
	    page_num: int,
	    blocks: list[ContentBlock],
	) -> None:
	    """Detect tables with pdfplumber and append TableBlock entries.

	    This is best-effort: any exception is logged as a warning (with
	    traceback) and swallowed, so text extraction still succeeds when table
	    detection fails. Tables appended before a failure are kept.

	    Args:
	        pdf_path: Location of the PDF file.
	        page_num: Zero-based page index. Nothing happens when it is at or
	            beyond the page count.
	        blocks: List that receives one TABLE ``ContentBlock`` per non-empty
	            table; it is mutated in place.
	    """
	    try:
	        with pdfplumber.open(str(pdf_path)) as pdf:
	            if page_num >= len(pdf.pages):
	                return
	            tables = pdf.pages[page_num].extract_tables()
	            for raw_table in tables:
	                if not raw_table:
	                    continue
	                # Normalise None rows once so the column count and the cell
	                # loop agree on how a missing row is treated.
	                rows = [row or [] for row in raw_table]
	                cells: list[TableCell] = []
	                for ri, row in enumerate(rows):
	                    for ci, val in enumerate(row):
	                        cells.append(TableCell(
	                            text=(val or "").strip(),  # None cell -> ""
	                            row=ri,
	                            col=ci,
	                            is_header=(ri == 0),  # first row is the header
	                        ))
	                tb = TableBlock(
	                    rows=len(rows),
	                    cols=max((len(row) for row in rows), default=0),
	                    cells=cells,
	                )
	                blocks.append(ContentBlock(
	                    block_type=BlockType.TABLE,
	                    table=tb,
	                    source="text",
	                ))
	    except Exception:
	        logger.warning("pdfplumber table extraction failed on page %d", page_num, exc_info=True)