Spaces:
Sleeping
Sleeping
| """ | |
| PDF text extraction using PyMuPDF (fitz). | |
| Extracts text with layout preservation and document metadata. | |
| """ | |
| import fitz # PyMuPDF | |
| import time | |
| import os | |
| from models.schemas import ExtractionResult, DocumentMetadata | |
def extract_pdf(file_path: str) -> ExtractionResult:
    """Extract text and metadata from a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        An ExtractionResult. On success, ``raw_text`` holds the text of every
        non-blank page, each prefixed with a ``--- Page N ---`` header and
        joined with blank lines. ``success`` is False when no page yielded
        any text, or when an exception was raised while reading the file; in
        the exception case the metadata carries only the file type. No
        exception propagates to the caller.
    """
    start_time = time.time()
    try:
        # The context manager guarantees the document is closed even when a
        # page fails to parse or the metadata cannot be built.
        with fitz.open(file_path) as doc:
            pages_text = []
            for page_num, page in enumerate(doc, start=1):
                # NOTE(review): "layout" is requested to keep the physical
                # positioning of text (columns, tables, etc.). Confirm the
                # installed PyMuPDF version supports this option rather than
                # silently falling back to plain "text" extraction.
                text = page.get_text("layout")
                # Skip pages that contain only whitespace.
                if text.strip():
                    pages_text.append(f"--- Page {page_num} ---\n{text}")
            full_text = "\n\n".join(pages_text)

            # The metadata mapping may be unavailable; treat that as empty
            # so the fallbacks below still apply.
            meta = doc.metadata or {}
            metadata = DocumentMetadata(
                title=meta.get("title", "") or os.path.basename(file_path),
                author=meta.get("author", "") or "Unknown",
                creation_date=meta.get("creationDate", ""),
                modification_date=meta.get("modDate", ""),
                page_count=len(doc),
                word_count=len(full_text.split()),
                character_count=len(full_text),
                file_type="PDF",
                extra={
                    "producer": meta.get("producer", ""),
                    "creator": meta.get("creator", ""),
                    "subject": meta.get("subject", ""),
                    "keywords": meta.get("keywords", ""),
                    "format": meta.get("format", ""),
                    "encryption": doc.is_encrypted,
                },
            )

        elapsed = (time.time() - start_time) * 1000

        # Only non-blank pages are collected, so an empty list means the
        # document produced no extractable text at all.
        if not pages_text:
            return ExtractionResult(
                raw_text="",
                metadata=metadata,
                success=False,
                error_message="No extractable text found in PDF. The document may contain only images — try uploading as an image for OCR processing.",
                extraction_time_ms=elapsed,
            )

        return ExtractionResult(
            raw_text=full_text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )
    except Exception as e:
        # Boundary handler: report every failure through the result object
        # instead of raising to the caller.
        elapsed = (time.time() - start_time) * 1000
        return ExtractionResult(
            raw_text="",
            metadata=DocumentMetadata(file_type="PDF"),
            success=False,
            error_message=f"PDF extraction failed: {str(e)}",
            extraction_time_ms=elapsed,
        )