File size: 2,657 Bytes
3cf9b4f
eb353a2
 
 
 
 
3cf9b4f
 
 
 
 
eb353a2
 
3cf9b4f
 
eb353a2
 
3cf9b4f
eb353a2
 
3cf9b4f
eb353a2
3cf9b4f
eb353a2
3cf9b4f
 
 
eb353a2
 
 
 
 
3cf9b4f
eb353a2
 
 
3cf9b4f
 
 
 
 
 
 
 
eb353a2
3cf9b4f
 
 
 
 
 
eb353a2
 
3cf9b4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb353a2
 
3cf9b4f
 
 
 
 
 
 
 
 
eb353a2
 
3cf9b4f
eb353a2
 
3cf9b4f
eb353a2
3cf9b4f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""PDF text extractor using PyMuPDF (lightweight alternative to Docling)."""

from datetime import datetime
from pathlib import Path
from typing import Optional

try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False


class PDFExtractor:
    """Extracts text from PDF documents using PyMuPDF.

    Every successful extraction is returned to the caller and is also
    written to ``<output_dir>/<pdf stem>.txt`` encoded as UTF-8. Two PDFs
    that share the same file stem therefore write to the same text file.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """Initialize the extractor and create the output directory.

        Args:
            output_dir: Directory to store extracted text files. Falls back
                to ``data/extracted`` (relative to the current working
                directory) when omitted. A plain string path is accepted
                as well as a ``Path``.
        """
        # Coerce to Path so a caller passing a str does not fail on .mkdir().
        self.output_dir = Path(output_dir or "data/extracted")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure(error: str) -> dict:
        """Build the result dict returned when an extraction does not succeed."""
        return {
            'success': False,
            'text': '',
            'page_count': 0,
            'error': error
        }

    def extract_text(self, pdf_path: Path) -> dict:
        """Extract text from a PDF file and save it next to other outputs.

        Pages whose text is empty or whitespace-only are left out of the
        combined text; the remaining pages are prefixed with a
        ``--- Page N ---`` header (1-based) and joined by blank lines.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'success', 'text', 'page_count', and 'error' keys.
            On failure 'success' is False, 'text' is empty, 'page_count'
            is 0 and 'error' holds a message; no exception is raised for
            errors that occur while opening, reading or saving.
        """
        pdf_path = Path(pdf_path).resolve()

        if not PYMUPDF_AVAILABLE:
            return self._failure('PyMuPDF not installed')

        if not pdf_path.exists():
            return self._failure(f'File not found: {pdf_path}')

        try:
            text_parts = []

            # The context manager closes the document even when reading a
            # page raises, so the underlying file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                for page_num, page in enumerate(doc, start=1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_num} ---\n{page_text}")

            full_text = "\n\n".join(text_parts)

            # Save extracted text
            txt_path = self.output_dir / f"{pdf_path.stem}.txt"
            txt_path.write_text(full_text, encoding='utf-8')

            return {
                'success': True,
                'text': full_text,
                'page_count': page_count,
                'error': None
            }

        except Exception as e:
            # Deliberate best-effort boundary: any failure (corrupt PDF,
            # unwritable output directory, ...) is reported in the result
            # dict instead of propagating to the caller.
            return self._failure(str(e))

    def extract_batch(self, pdf_paths: list) -> list:
        """Extract text from multiple PDFs.

        Args:
            pdf_paths: List of PDF file paths.

        Returns:
            List of extraction results, one dict per input path and in the
            same order as ``pdf_paths``.
        """
        return [self.extract_text(pdf_path) for pdf_path in pdf_paths]