"""
PDF ํ์ผ ์ฒ๋ฆฌ ๋ฐ ํ
์คํธ ์ถ์ถ
"""
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import pdfplumber
import PyPDF2
from loguru import logger
from tqdm import tqdm
class PDFProcessor:
    """Extract text and metadata from the PDF files under a directory."""

    def __init__(self, pdf_directory: str):
        """
        Args:
            pdf_directory: Path of the directory containing the PDF files.
        """
        self.pdf_directory = Path(pdf_directory)
        # Populated by process_all_pdfs(); one dict per successfully parsed PDF.
        self.processed_docs: List[Dict[str, Any]] = []

    def get_pdf_files(self) -> List[Path]:
        """Find every PDF file (recursively) under the configured directory.

        Returns:
            List of paths to ``*.pdf`` files.

        Raises:
            FileNotFoundError: If the configured directory does not exist.
        """
        if not self.pdf_directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.pdf_directory}")
        pdf_files = list(self.pdf_directory.glob("**/*.pdf"))
        logger.info(f"Found {len(pdf_files)} PDF files in {self.pdf_directory}")
        return pdf_files

    def extract_text_from_pdf(self, pdf_path: Path) -> Optional[Dict[str, Any]]:
        """
        Extract text and metadata from a single PDF file.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            Dict with 'text', 'metadata', 'filename', 'filepath' and
            'page_count', or None if the file could not be processed.
        """
        try:
            # pdfplumber for the text extraction (more accurate layout handling).
            with pdfplumber.open(pdf_path) as pdf:
                text = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n\n"
            # PyPDF2 for the document-information metadata.
            with open(pdf_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                metadata = pdf_reader.metadata if pdf_reader.metadata else {}
                page_count = len(pdf_reader.pages)
            return {
                'text': text.strip(),
                'metadata': {
                    'title': metadata.get('/Title', ''),
                    'author': metadata.get('/Author', ''),
                    'subject': metadata.get('/Subject', ''),
                    'creator': metadata.get('/Creator', ''),
                },
                'filename': pdf_path.name,
                'filepath': str(pdf_path),
                'page_count': page_count
            }
        except Exception as e:
            # Best-effort batch processing: log and skip unreadable files.
            logger.error(f"Error processing {pdf_path.name}: {str(e)}")
            return None

    def process_all_pdfs(self) -> List[Dict[str, Any]]:
        """
        Process every PDF file found in the directory.

        Returns:
            List of dictionaries containing extracted text and metadata.
        """
        pdf_files = self.get_pdf_files()
        self.processed_docs = []
        logger.info(f"Processing {len(pdf_files)} PDF files...")
        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            doc_data = self.extract_text_from_pdf(pdf_path)
            # Keep only documents that yielded some text.
            if doc_data and doc_data['text']:
                self.processed_docs.append(doc_data)
            else:
                logger.warning(f"No text extracted from {pdf_path.name}")
        logger.info(f"Successfully processed {len(self.processed_docs)} PDFs")
        return self.processed_docs

    def get_statistics(self) -> Dict[str, Any]:
        """Return summary statistics for the processed documents.

        Returns:
            Empty dict when nothing has been processed yet; otherwise totals
            and per-document averages for pages and characters.
        """
        if not self.processed_docs:
            return {}
        total_pages = sum(doc['page_count'] for doc in self.processed_docs)
        total_chars = sum(len(doc['text']) for doc in self.processed_docs)
        return {
            'total_documents': len(self.processed_docs),
            'total_pages': total_pages,
            'total_characters': total_chars,
            'avg_pages_per_doc': total_pages / len(self.processed_docs),
            'avg_chars_per_doc': total_chars / len(self.processed_docs),
        }