Spaces:

sonuprasad23
/

hrbot

Sleeping

hrbot / src /document_processor /converter.py

Sonu Prasad

Integrated Docling

3cf9b4f 2 months ago

2.66 kB

	"""PDF text extractor using PyMuPDF (lightweight alternative to Docling)."""

	from datetime import datetime
	from pathlib import Path
	from typing import Optional

	try:
	import fitz # PyMuPDF
	PYMUPDF_AVAILABLE = True
	except ImportError:
	PYMUPDF_AVAILABLE = False


	class PDFExtractor:
	    """Extract plain text from PDF documents using PyMuPDF.

	    Each successful extraction is also written to ``<output_dir>/<stem>.txt``
	    (UTF-8), where ``<stem>`` is the PDF file name without its extension.
	    """

	    def __init__(self, output_dir: Optional[Path] = None):
	        """Initialize the extractor and make sure the output directory exists.

	        Args:
	            output_dir: Directory to store extracted text files. Falls back
	                to ``data/extracted`` when omitted. Plain string paths are
	                accepted as well as ``Path`` objects.
	        """
	        # Coerce to Path so that a string argument does not fail on .mkdir().
	        self.output_dir = Path(output_dir or "data/extracted")
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	    @staticmethod
	    def _failure(error: str) -> dict:
	        """Build the result dict returned for any failed extraction."""
	        return {
	            'success': False,
	            'text': '',
	            'page_count': 0,
	            'error': error,
	        }

	    def extract_text(self, pdf_path: Path) -> dict:
	        """Extract text from a PDF file and save it next to other extractions.

	        Pages whose text is empty or whitespace-only are left out of the
	        combined text; every included page is preceded by a
	        ``--- Page N ---`` marker (1-based). ``page_count`` reports the total
	        number of pages in the document, including skipped ones.

	        Args:
	            pdf_path: Path to the PDF file.

	        Returns:
	            Dict with 'success', 'text', 'page_count', and 'error' keys.
	            On failure 'success' is False, 'text' is empty, 'page_count'
	            is 0 and 'error' holds a message; no exception is raised for
	            errors that occur while reading the PDF or writing the text file.
	        """
	        pdf_path = Path(pdf_path).resolve()

	        if not PYMUPDF_AVAILABLE:
	            return self._failure('PyMuPDF not installed')

	        if not pdf_path.exists():
	            return self._failure(f'File not found: {pdf_path}')

	        try:
	            # The context manager closes the document even when reading a
	            # page fails part-way through, so the file handle is not leaked.
	            with fitz.open(pdf_path) as doc:
	                page_texts = [page.get_text() for page in doc]
	                page_count = len(doc)

	            text_parts = [
	                f"--- Page {page_num} ---\n{page_text}"
	                for page_num, page_text in enumerate(page_texts, start=1)
	                if page_text.strip()
	            ]
	            full_text = "\n\n".join(text_parts)

	            # Save extracted text
	            txt_path = self.output_dir / f"{pdf_path.stem}.txt"
	            txt_path.write_text(full_text, encoding='utf-8')

	            return {
	                'success': True,
	                'text': full_text,
	                'page_count': page_count,
	                'error': None,
	            }

	        except Exception as e:
	            # Deliberate catch-all: callers receive a failure result instead
	            # of an exception, which keeps batch extraction going.
	            return self._failure(str(e))

	    def extract_batch(self, pdf_paths: list) -> list:
	        """Extract text from multiple PDFs.

	        Args:
	            pdf_paths: List of PDF file paths.

	        Returns:
	            List of extraction results, one per input path and in the same
	            order, each shaped like the return value of ``extract_text``.
	        """
	        return [self.extract_text(pdf_path) for pdf_path in pdf_paths]