text-extraction-api / extractors /pdf_extractor.py
krishnachoudhary-hclguvi
Deploy text extraction API files
52a0fe9 unverified
"""
PDF text extraction using PyMuPDF (fitz).
Extracts text with layout preservation and document metadata.
"""
import fitz # PyMuPDF
import time
import os
from models.schemas import ExtractionResult, DocumentMetadata
def extract_pdf(file_path: str) -> ExtractionResult:
"""Extract text and metadata from a PDF file."""
start_time = time.time()
try:
doc = fitz.open(file_path)
# Extract text from all pages with full layout preservation
pages_text = []
for page_num in range(len(doc)):
page = doc[page_num]
# "layout" mode preserves the physical positioning of text (columns, tables, etc.)
# This ensures the "pointer position" matches the original PDF look.
text = page.get_text("layout")
if text.strip():
pages_text.append(f"--- Page {page_num + 1} ---\n{text}")
full_text = "\n\n".join(pages_text)
# Extract metadata
meta = doc.metadata
metadata = DocumentMetadata(
title=meta.get("title", "") or os.path.basename(file_path),
author=meta.get("author", "") or "Unknown",
creation_date=meta.get("creationDate", ""),
modification_date=meta.get("modDate", ""),
page_count=len(doc),
word_count=len(full_text.split()) if full_text else 0,
character_count=len(full_text),
file_type="PDF",
extra={
"producer": meta.get("producer", ""),
"creator": meta.get("creator", ""),
"subject": meta.get("subject", ""),
"keywords": meta.get("keywords", ""),
"format": meta.get("format", ""),
"encryption": doc.is_encrypted,
}
)
doc.close()
elapsed = (time.time() - start_time) * 1000
if not full_text.strip():
return ExtractionResult(
raw_text="",
metadata=metadata,
success=False,
error_message="No extractable text found in PDF. The document may contain only images — try uploading as an image for OCR processing.",
extraction_time_ms=elapsed,
)
return ExtractionResult(
raw_text=full_text,
metadata=metadata,
success=True,
extraction_time_ms=elapsed,
)
except Exception as e:
elapsed = (time.time() - start_time) * 1000
return ExtractionResult(
raw_text="",
metadata=DocumentMetadata(file_type="PDF"),
success=False,
error_message=f"PDF extraction failed: {str(e)}",
extraction_time_ms=elapsed,
)