Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| from uuid import uuid4 | |
| import fitz | |
| from app.config import Settings | |
| from app.schemas.document_verification import PdfAnalysis | |
class PdfProcessor:
    """Extract text, render page images, and collect basic signals from a PDF.

    Rendered page images are written as PNG files under
    ``settings.output_dir``.
    """

    def __init__(self, settings: Settings) -> None:
        """Keep a reference to *settings* (``output_dir`` is read when rendering)."""
        self.settings = settings

    def process(self, pdf_path: Path, max_pages: int) -> PdfAnalysis:
        """Analyse the PDF at *pdf_path*, inspecting at most *max_pages* pages.

        For every inspected page the plain text is extracted and the page is
        rendered to a PNG file. Encrypted documents are reported without any
        extraction or rendering.

        Args:
            pdf_path: Location of the PDF to open.
            max_pages: Upper bound on the number of pages to inspect; values
                below zero are treated as zero.

        Returns:
            A ``PdfAnalysis`` carrying the extracted text, the rendered page
            paths, the stringified document metadata, and the flags/warnings
            raised along the way.

        Raises:
            ValueError: If the document cannot be opened.
        """
        warnings: list[str] = []
        flags: list[str] = []
        rendered_pages: list[str] = []
        page_texts: list[str] = []

        try:
            document = fitz.open(pdf_path)
        except Exception as exc:
            raise ValueError("PDF could not be opened safely.") from exc

        with document:
            is_encrypted = document.is_encrypted
            page_count = document.page_count
            pages_processed = min(page_count, max(0, max_pages))
            # Force keys and values to str so the metadata mapping is uniform.
            metadata = {
                str(key): str(value)
                for key, value in (document.metadata or {}).items()
            }

            if is_encrypted:
                warnings.append(
                    "PDF is encrypted; text extraction and rendering were skipped."
                )
                flags.append("encrypted_pdf")
                return PdfAnalysis(
                    checked=True,
                    is_pdf=True,
                    is_encrypted=True,
                    has_text_layer=False,
                    image_only_pdf=False,
                    page_count=page_count,
                    pages_processed=0,
                    pdf_text="",
                    page_texts=[],
                    rendered_pages=[],
                    raw_metadata=metadata,
                    structure_risk_score=0.4,
                    flags=flags,
                    warnings=warnings,
                )

            if page_count > pages_processed:
                warnings.append(
                    f"PDF page processing truncated from {page_count} to {pages_processed} page(s)."
                )
                flags.append("max_pages_truncated")

            for page_index in range(pages_processed):
                page = document.load_page(page_index)
                page_texts.append(page.get_text("text").strip())
                rendered_pages.append(self._render_page(page, page_index))

            # Entries are already stripped, so truthiness alone tells whether
            # a page yielded text and the joined result needs no further strip.
            pdf_text = "\n".join(text for text in page_texts if text)
            has_text_layer = any(page_texts)

            # Only call the document image-only when at least one page was
            # actually inspected; with zero pages processed (max_pages <= 0)
            # there is no evidence either way, so the flag must stay off.
            image_only_pdf = pages_processed > 0 and not has_text_layer

            structure_risk_score = 0.0
            if image_only_pdf:
                flags.append("image_only_pdf")
                structure_risk_score = 0.2

            return PdfAnalysis(
                checked=True,
                is_pdf=True,
                is_encrypted=False,
                has_text_layer=has_text_layer,
                image_only_pdf=image_only_pdf,
                page_count=page_count,
                pages_processed=pages_processed,
                pdf_text=pdf_text,
                page_texts=page_texts,
                rendered_pages=rendered_pages,
                raw_metadata=metadata,
                structure_risk_score=structure_risk_score,
                flags=flags,
                warnings=warnings,
            )

    def _render_page(self, page: fitz.Page, page_index: int) -> str:
        """Render *page* to a PNG under ``settings.output_dir``.

        Args:
            page: The page to rasterise.
            page_index: Zero-based page index; the file name uses it one-based.

        Returns:
            The resolved path of the written PNG, as a string.
        """
        self.settings.output_dir.mkdir(parents=True, exist_ok=True)
        # Scale factor of 200/72 on both axes (200 DPI if the base resolution
        # is 72 DPI, which is PyMuPDF's documented default).
        zoom = 200 / 72
        matrix = fitz.Matrix(zoom, zoom)
        pixmap = page.get_pixmap(matrix=matrix, alpha=False)
        # A random hex suffix keeps repeated renders from overwriting each other.
        output_path = (
            self.settings.output_dir
            / f"pdf_page_{page_index + 1}_{uuid4().hex}.png"
        )
        pixmap.save(output_path)
        return str(output_path.resolve())