"""
PDF ํŒŒ์ผ ์ฒ˜๋ฆฌ ๋ฐ ํ…์ŠคํŠธ ์ถ”์ถœ
"""
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import pdfplumber
import PyPDF2
from loguru import logger
from tqdm import tqdm


class PDFProcessor:
    """Extract text and metadata from the PDF files in a directory."""

    def __init__(self, pdf_directory: str):
        """
        Args:
            pdf_directory: Path of the directory containing the PDF files.
        """
        self.pdf_directory = Path(pdf_directory)
        # Populated by process_all_pdfs(); each entry is the dict produced
        # by extract_text_from_pdf().
        self.processed_docs: List[Dict[str, Any]] = []

    def get_pdf_files(self) -> List[Path]:
        """Recursively find every PDF file under the directory.

        Returns:
            Sorted list of PDF paths. The suffix match is case-insensitive
            so files named ``*.PDF`` are found as well.

        Raises:
            FileNotFoundError: If the configured directory does not exist.
        """
        if not self.pdf_directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.pdf_directory}")

        # rglob("*") + suffix filter instead of glob("**/*.pdf"):
        # case-insensitive, skips directories, and sorting makes the
        # processing order deterministic.
        pdf_files = sorted(
            p
            for p in self.pdf_directory.rglob("*")
            if p.is_file() and p.suffix.lower() == ".pdf"
        )
        logger.info(f"Found {len(pdf_files)} PDF files in {self.pdf_directory}")
        return pdf_files

    def extract_text_from_pdf(self, pdf_path: Path) -> Optional[Dict[str, Any]]:
        """
        Extract the text of a single PDF file.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            Dict with 'text', 'metadata', 'filename', 'filepath' and
            'page_count', or ``None`` if the file could not be processed.
        """
        try:
            # pdfplumber gives more accurate text extraction than PyPDF2.
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # join() instead of += in a loop: linear instead of quadratic.
            text = "\n\n".join(t for t in page_texts if t)

            # PyPDF2 is used only for the document metadata and page count.
            with open(pdf_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                metadata = pdf_reader.metadata or {}
                page_count = len(pdf_reader.pages)

            return {
                'text': text.strip(),
                'metadata': {
                    'title': metadata.get('/Title', ''),
                    'author': metadata.get('/Author', ''),
                    'subject': metadata.get('/Subject', ''),
                    'creator': metadata.get('/Creator', ''),
                },
                'filename': pdf_path.name,
                'filepath': str(pdf_path),
                'page_count': page_count,
            }

        except Exception as e:
            # Best-effort batch processing: log and skip unreadable files.
            logger.error(f"Error processing {pdf_path.name}: {str(e)}")
            return None

    def process_all_pdfs(self) -> List[Dict[str, Any]]:
        """
        Process every PDF file found in the directory.

        Returns:
            List of dictionaries containing extracted text and metadata;
            files that yielded no text are logged and skipped.
        """
        pdf_files = self.get_pdf_files()
        self.processed_docs = []

        logger.info(f"Processing {len(pdf_files)} PDF files...")

        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            doc_data = self.extract_text_from_pdf(pdf_path)
            # Keep only documents that produced non-empty text.
            if doc_data and doc_data['text']:
                self.processed_docs.append(doc_data)
            else:
                logger.warning(f"No text extracted from {pdf_path.name}")

        logger.info(f"Successfully processed {len(self.processed_docs)} PDFs")
        return self.processed_docs

    def get_statistics(self) -> Dict[str, Any]:
        """Return aggregate statistics over the processed documents.

        Returns:
            Dict with document/page/character totals and per-document
            averages, or an empty dict if nothing has been processed
            (which also guards the divisions below against zero).
        """
        if not self.processed_docs:
            return {}

        total_pages = sum(doc['page_count'] for doc in self.processed_docs)
        total_chars = sum(len(doc['text']) for doc in self.processed_docs)
        doc_count = len(self.processed_docs)

        return {
            'total_documents': doc_count,
            'total_pages': total_pages,
            'total_characters': total_chars,
            'avg_pages_per_doc': total_pages / doc_count,
            'avg_chars_per_doc': total_chars / doc_count,
        }