| """ | |
| PDF ํ์ผ ์ฒ๋ฆฌ ๋ฐ ํ ์คํธ ์ถ์ถ | |
| """ | |
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import PyPDF2
import pdfplumber
from loguru import logger
from tqdm import tqdm
class PDFProcessor:
    """Extract text and metadata from PDF files in a directory.

    Text is extracted with pdfplumber (better layout accuracy), while
    document metadata and the page count come from PyPDF2.
    """

    def __init__(self, pdf_directory: str):
        """
        Args:
            pdf_directory: Path to the directory containing PDF files.
        """
        self.pdf_directory = Path(pdf_directory)
        # Filled by process_all_pdfs(); one dict per successfully parsed PDF.
        self.processed_docs: List[Dict[str, Any]] = []

    def get_pdf_files(self) -> List[Path]:
        """Find all PDF files in the directory, recursively.

        Returns:
            List of paths matching ``**/*.pdf``.

        Raises:
            FileNotFoundError: If the configured directory does not exist.
        """
        if not self.pdf_directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.pdf_directory}")
        pdf_files = list(self.pdf_directory.glob("**/*.pdf"))
        logger.info(f"Found {len(pdf_files)} PDF files in {self.pdf_directory}")
        return pdf_files

    def extract_text_from_pdf(self, pdf_path: Path) -> Optional[Dict[str, Any]]:
        """Extract text and metadata from a single PDF file.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'text', 'metadata', 'filename', 'filepath' and
            'page_count' keys, or None if the file could not be processed.
        """
        try:
            # Text extraction via pdfplumber (more accurate than PyPDF2).
            with pdfplumber.open(pdf_path) as pdf:
                # Collect per-page text and join once — avoids the
                # quadratic cost of repeated string concatenation.
                page_texts = [page.extract_text() for page in pdf.pages]
                text = "\n\n".join(t for t in page_texts if t)
            # Metadata and page count via PyPDF2.
            with open(pdf_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                metadata = pdf_reader.metadata if pdf_reader.metadata else {}
                page_count = len(pdf_reader.pages)
            return {
                'text': text.strip(),
                'metadata': {
                    'title': metadata.get('/Title', ''),
                    'author': metadata.get('/Author', ''),
                    'subject': metadata.get('/Subject', ''),
                    'creator': metadata.get('/Creator', ''),
                },
                'filename': pdf_path.name,
                'filepath': str(pdf_path),
                'page_count': page_count,
            }
        except Exception as e:
            # Best-effort: log and skip corrupt/unreadable PDFs so a batch
            # run is not aborted by a single bad file.
            logger.error(f"Error processing {pdf_path.name}: {str(e)}")
            return None

    def process_all_pdfs(self) -> List[Dict[str, Any]]:
        """Process every PDF file found in the directory.

        Returns:
            List of dicts with extracted text and metadata. Files that
            yielded no text (or failed to parse) are skipped with a warning.
        """
        pdf_files = self.get_pdf_files()
        self.processed_docs = []
        logger.info(f"Processing {len(pdf_files)} PDF files...")
        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            doc_data = self.extract_text_from_pdf(pdf_path)
            if doc_data and doc_data['text']:  # keep only docs with actual text
                self.processed_docs.append(doc_data)
            else:
                logger.warning(f"No text extracted from {pdf_path.name}")
        logger.info(f"Successfully processed {len(self.processed_docs)} PDFs")
        return self.processed_docs

    def get_statistics(self) -> Dict[str, Any]:
        """Summary statistics over the processed documents.

        Returns:
            Dict with document/page/character totals and per-document
            averages, or an empty dict when nothing has been processed.
        """
        if not self.processed_docs:
            return {}
        doc_count = len(self.processed_docs)
        total_pages = sum(doc['page_count'] for doc in self.processed_docs)
        total_chars = sum(len(doc['text']) for doc in self.processed_docs)
        return {
            'total_documents': doc_count,
            'total_pages': total_pages,
            'total_characters': total_chars,
            'avg_pages_per_doc': total_pages / doc_count,
            'avg_chars_per_doc': total_chars / doc_count,
        }