File size: 3,585 Bytes
6a8a839
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from pathlib import Path
from uuid import uuid4

import fitz

from app.config import Settings
from app.schemas.document_verification import PdfAnalysis


class PdfProcessor:
    """Extract per-page text and render page images from a PDF via PyMuPDF.

    The processor reads ``settings.output_dir`` to decide where rendered page
    images are written.
    """

    # Rendering resolution. The zoom factor passed to PyMuPDF is DPI / 72,
    # 72 being the base resolution of an unscaled page render.
    _RENDER_DPI = 200
    _BASE_DPI = 72

    def __init__(self, settings: Settings) -> None:
        """Store the settings object (``output_dir`` is read when rendering)."""
        self.settings = settings

    def process(self, pdf_path: Path, max_pages: int) -> PdfAnalysis:
        """Open *pdf_path*, extract text and render up to *max_pages* pages.

        Args:
            pdf_path: Location of the PDF to analyse.
            max_pages: Upper bound on the number of pages to process. Values
                below zero are treated as zero.

        Returns:
            A ``PdfAnalysis`` describing the document. For an encrypted
            document no pages are processed and ``structure_risk_score`` is
            0.4. Otherwise the score is 0.2 when at least one page was
            processed and none of the processed pages yielded text, else 0.0.

        Raises:
            ValueError: If the document cannot be opened.
        """
        try:
            document = fitz.open(pdf_path)
        except Exception as exc:
            raise ValueError("PDF could not be opened safely.") from exc

        warnings: list[str] = []
        flags: list[str] = []

        with document:
            is_encrypted = document.is_encrypted
            page_count = document.page_count
            metadata = {
                str(key): str(value)
                for key, value in (document.metadata or {}).items()
            }

            if is_encrypted:
                # Nothing can be extracted or rendered, so report the document
                # as checked with empty content.
                warnings.append("PDF is encrypted; text extraction and rendering were skipped.")
                flags.append("encrypted_pdf")
                return PdfAnalysis(
                    checked=True,
                    is_pdf=True,
                    is_encrypted=True,
                    has_text_layer=False,
                    image_only_pdf=False,
                    page_count=page_count,
                    pages_processed=0,
                    pdf_text="",
                    page_texts=[],
                    rendered_pages=[],
                    raw_metadata=metadata,
                    structure_risk_score=0.4,
                    flags=flags,
                    warnings=warnings,
                )

            # Clamp so a negative max_pages behaves like zero.
            pages_processed = min(page_count, max(0, max_pages))
            if page_count > pages_processed:
                warnings.append(f"PDF page processing truncated from {page_count} to {pages_processed} page(s).")
                flags.append("max_pages_truncated")

            page_texts: list[str] = []
            rendered_pages: list[str] = []
            for page_index in range(pages_processed):
                page = document.load_page(page_index)
                page_texts.append(page.get_text("text").strip())
                rendered_pages.append(self._render_page(page, page_index))

            # Every entry is already stripped, so truthiness alone tells us
            # whether a page produced text and the joined string needs no
            # further trimming.
            pdf_text = "\n".join(text for text in page_texts if text)
            has_text_layer = any(page_texts)

            # Only call the document image-only when at least one page was
            # actually inspected; with zero processed pages the absence of
            # text says nothing about the document.
            image_only_pdf = pages_processed > 0 and not has_text_layer
            structure_risk_score = 0.0
            if image_only_pdf:
                flags.append("image_only_pdf")
                structure_risk_score = 0.2

            return PdfAnalysis(
                checked=True,
                is_pdf=True,
                is_encrypted=False,
                has_text_layer=has_text_layer,
                image_only_pdf=image_only_pdf,
                page_count=page_count,
                pages_processed=pages_processed,
                pdf_text=pdf_text,
                page_texts=page_texts,
                rendered_pages=rendered_pages,
                raw_metadata=metadata,
                structure_risk_score=structure_risk_score,
                flags=flags,
                warnings=warnings,
            )

    def _render_page(self, page: fitz.Page, page_index: int) -> str:
        """Render *page* to a PNG in ``settings.output_dir`` and return its path.

        Args:
            page: The loaded page to rasterise.
            page_index: Zero-based page index; the file name uses the
                one-based page number.

        Returns:
            The resolved path of the written PNG, as a string.
        """
        output_dir = self.settings.output_dir
        output_dir.mkdir(parents=True, exist_ok=True)

        zoom = self._RENDER_DPI / self._BASE_DPI
        matrix = fitz.Matrix(zoom, zoom)
        pixmap = page.get_pixmap(matrix=matrix, alpha=False)

        # The random suffix keeps repeated runs from overwriting each other.
        output_path = output_dir / f"pdf_page_{page_index + 1}_{uuid4().hex}.png"
        pixmap.save(output_path)
        return str(output_path.resolve())