# Spaces: Sleeping · File size: 2,753 Bytes · 52a0fe9
"""
PDF text extraction using PyMuPDF (fitz).
Extracts text with layout preservation and document metadata.
"""
import fitz # PyMuPDF
import time
import os
from models.schemas import ExtractionResult, DocumentMetadata
def extract_pdf(file_path: str) -> ExtractionResult:
    """Extract text and metadata from a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF. Its base name is
            used as the title when the document declares none.

    Returns:
        An ExtractionResult. On success, ``raw_text`` holds the text of every
        page that has non-whitespace text, each prefixed with a
        ``--- Page N ---`` header and joined by blank lines. ``success`` is
        False when no text was found or when extraction raised an exception;
        ``error_message`` then describes the reason. ``extraction_time_ms``
        is populated in every case.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    try:
        # The context manager closes the document even when page extraction
        # or metadata construction raises, so the handle is never leaked.
        with fitz.open(file_path) as doc:
            pages_text = []
            for page_number, page in enumerate(doc, start=1):
                # NOTE(review): "layout" is not among the output options
                # listed in the PyMuPDF Page.get_text() documentation
                # ("text", "blocks", "words", ...). Confirm it is not silently
                # treated as plain "text" before relying on layout fidelity.
                text = page.get_text("layout")
                # Pages without visible text are skipped entirely.
                if text.strip():
                    pages_text.append(f"--- Page {page_number} ---\n{text}")
            full_text = "\n\n".join(pages_text)

            # The metadata dict may be None, and individual entries may be
            # None rather than missing, so normalise every string field.
            meta = doc.metadata or {}
            metadata = DocumentMetadata(
                title=meta.get("title") or os.path.basename(file_path),
                author=meta.get("author") or "Unknown",
                creation_date=meta.get("creationDate") or "",
                modification_date=meta.get("modDate") or "",
                page_count=len(doc),
                word_count=len(full_text.split()),
                character_count=len(full_text),
                file_type="PDF",
                extra={
                    "producer": meta.get("producer") or "",
                    "creator": meta.get("creator") or "",
                    "subject": meta.get("subject") or "",
                    "keywords": meta.get("keywords") or "",
                    "format": meta.get("format") or "",
                    "encryption": doc.is_encrypted,
                },
            )

        elapsed = (time.perf_counter() - start_time) * 1000

        if not full_text:
            # Every page was blank: report failure but keep the metadata.
            return ExtractionResult(
                raw_text="",
                metadata=metadata,
                success=False,
                error_message="No extractable text found in PDF. The document may contain only images — try uploading as an image for OCR processing.",
                extraction_time_ms=elapsed,
            )

        return ExtractionResult(
            raw_text=full_text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )
    except Exception as e:
        # Boundary handler: callers receive a failed result instead of an
        # exception for anything raised while opening or reading the file.
        elapsed = (time.perf_counter() - start_time) * 1000
        return ExtractionResult(
            raw_text="",
            metadata=DocumentMetadata(file_type="PDF"),
            success=False,
            error_message=f"PDF extraction failed: {str(e)}",
            extraction_time_ms=elapsed,
        )
|