File size: 2,868 Bytes
edac567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import sys
import os
from typing import List, Dict, Optional

# Prepend the directory two levels above this file's directory to sys.path.
# Presumably that is the project root, so the `src` and `config` packages
# imported further down resolve when this file is run directly -- TODO confirm
# against the repository layout.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

# Bind PdfReader from whichever PDF library is installed: PyPDF2 is tried
# first, then pypdf.
try:
    from PyPDF2 import PdfReader
except ImportError:
    try:
        from pypdf import PdfReader
    except ImportError:
        # Neither library is installed: report it and bind PdfReader to None so
        # importing this module still succeeds. Any later PdfReader(...) call
        # will then raise TypeError because None is not callable.
        print("Error: PDF reading library not found. Please install PyPDF2 or pypdf.")
        PdfReader = None

from src.utils.chunking import chunk_pdf_text, clean_text
from config.settings import Config

class PDFProcessor:
    """Process PDFs into cleaned text chunks.

    Text is extracted page by page with ``PdfReader``, passed through
    ``clean_text`` and split with ``chunk_pdf_text``. Read failures are
    reported with ``print`` and produce empty return values instead of
    raising.
    """

    # Chunks whose stripped length does not exceed this many characters are
    # dropped by process_pdf.
    MIN_CHUNK_CHARS = 50

    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None) -> None:
        """
        Initialize processor with chunk parameters.

        Args:
            chunk_size: Characters per chunk. ``None`` or ``0`` falls back to
                ``Config.CHUNK_SIZE``.
            overlap: Overlap between chunks. ``None`` falls back to
                ``Config.CHUNK_OVERLAP``; an explicit ``0`` is honoured.
        """
        # A zero chunk size is not usable, so any falsy value means "default".
        self.chunk_size = chunk_size or Config.CHUNK_SIZE
        # Zero overlap is a legitimate request; only None means "use default".
        self.overlap = Config.CHUNK_OVERLAP if overlap is None else overlap

    def process_pdf(self, file_path: str) -> List[str]:
        """
        Read PDF, extract text, clean, and chunk.

        Args:
            file_path: Path to PDF.

        Returns:
            List of chunk strings whose stripped length is greater than
            ``MIN_CHUNK_CHARS``. Empty list when no text could be extracted.
        """
        raw = self._extract_text(file_path)
        if not raw.strip():
            return []
        cleaned = clean_text(raw)
        chunks = chunk_pdf_text(cleaned, self.chunk_size, self.overlap)
        # Drop fragments too short to be useful.
        return [c for c in chunks if len(c.strip()) > self.MIN_CHUNK_CHARS]

    def get_pdf_info(self, file_path: str) -> Dict:
        """
        Retrieve simple info (pages, metadata, encryption).

        Args:
            file_path: Path to PDF.

        Returns:
            Dict with keys ``"num_pages"``, ``"metadata"`` and ``"encrypted"``,
            or an empty dict when the file cannot be read.
        """
        if PdfReader is None:
            # The module-level import fallback found no PDF library.
            print("[PDFProcessor] Info error: no PDF library available (install PyPDF2 or pypdf)")
            return {}
        try:
            reader = PdfReader(file_path)
            return {
                "num_pages": len(reader.pages),
                "metadata": reader.metadata,
                "encrypted": reader.is_encrypted,
            }
        except Exception as e:
            # Best-effort: report the problem and signal failure with {}.
            print(f"[PDFProcessor] Info error: {e}")
            return {}

    def _extract_text(self, file_path: str) -> str:
        """
        Extract text from all pages.

        Pages that yield no text, or whose extraction raises, are skipped; a
        failing page does not abort the remaining pages.

        Args:
            file_path: Path to PDF.

        Returns:
            Concatenated text with page separators, or ``""`` when the file
            cannot be read.
        """
        if PdfReader is None:
            # The module-level import fallback found no PDF library.
            print("[PDFProcessor] Read error: no PDF library available (install PyPDF2 or pypdf)")
            return ""
        try:
            reader = PdfReader(file_path)
            out: List[str] = []
            for page_no, page in enumerate(reader.pages, start=1):
                try:
                    text = page.extract_text() or ""
                    if text.strip():
                        out.append(f"\n--- Page {page_no} ---\n{text}")
                except Exception as pe:
                    # Keep going: one unreadable page should not lose the rest.
                    print(f"[PDFProcessor] Page {page_no} extraction failed: {pe}")
            return "".join(out)
        except Exception as e:
            print(f"[PDFProcessor] Read error: {e}")
            return ""