File size: 2,753 Bytes
52a0fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
PDF text extraction using PyMuPDF (fitz).
Extracts text with layout preservation and document metadata.
"""
import fitz  # PyMuPDF
import time
import os
from models.schemas import ExtractionResult, DocumentMetadata


def extract_pdf(file_path: str) -> ExtractionResult:
    """Extract text and metadata from a PDF file.

    Every page that yields non-whitespace text is prefixed with a
    ``--- Page N ---`` header (1-based) and the pages are joined with a
    blank line between them.

    Args:
        file_path: Path of the PDF to open. Its basename is used as the
            title when the document metadata carries none.

    Returns:
        An ``ExtractionResult``. ``success`` is ``False`` when no page
        produced any text, or when an ``Exception`` was raised while
        processing; in both cases ``raw_text`` is empty and
        ``error_message`` describes the problem. Exceptions are reported
        through the result instead of being propagated to the caller.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()

    try:
        # The context manager closes the document even if extraction or
        # metadata construction raises part-way through.
        with fitz.open(file_path) as doc:
            pages_text = []
            for page_num, page in enumerate(doc, start=1):
                # NOTE(review): "layout" is not among the output modes listed
                # in the PyMuPDF Page.get_text() documentation — confirm it is
                # not silently treated as plain "text".
                text = page.get_text("layout")
                if text.strip():
                    pages_text.append(f"--- Page {page_num} ---\n{text}")

            full_text = "\n\n".join(pages_text)

            # Guard against a missing metadata dict and against keys that are
            # present but hold None, so the schema always receives strings.
            meta = doc.metadata or {}
            metadata = DocumentMetadata(
                title=meta.get("title") or os.path.basename(file_path),
                author=meta.get("author") or "Unknown",
                creation_date=meta.get("creationDate") or "",
                modification_date=meta.get("modDate") or "",
                page_count=len(doc),
                word_count=len(full_text.split()),
                character_count=len(full_text),
                file_type="PDF",
                extra={
                    "producer": meta.get("producer") or "",
                    "creator": meta.get("creator") or "",
                    "subject": meta.get("subject") or "",
                    "keywords": meta.get("keywords") or "",
                    "format": meta.get("format") or "",
                    "encryption": doc.is_encrypted,
                },
            )

        elapsed = (time.perf_counter() - start_time) * 1000

        # pages_text only holds pages with non-whitespace text, so an empty
        # list means the whole document produced nothing extractable.
        if not pages_text:
            return ExtractionResult(
                raw_text="",
                metadata=metadata,
                success=False,
                error_message="No extractable text found in PDF. The document may contain only images — try uploading as an image for OCR processing.",
                extraction_time_ms=elapsed,
            )

        return ExtractionResult(
            raw_text=full_text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )

    except Exception as e:
        elapsed = (time.perf_counter() - start_time) * 1000
        return ExtractionResult(
            raw_text="",
            metadata=DocumentMetadata(file_type="PDF"),
            success=False,
            error_message=f"PDF extraction failed: {e}",
            extraction_time_ms=elapsed,
        )