# financial-rag-chatbot/services/pdf_processor.py
# Commit f6b05db: Add complete Financial RAG system with Metacognitive Agent
"""
PDF ํŒŒ์ผ ์ฒ˜๋ฆฌ ๋ฐ ํ…์ŠคํŠธ ์ถ”์ถœ
"""
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import pdfplumber
import PyPDF2
from loguru import logger
from tqdm import tqdm
class PDFProcessor:
    """Extracts text and metadata from PDF files in a directory.

    Uses pdfplumber for text extraction (generally more accurate on
    layout-heavy documents) and PyPDF2 for the document-information
    metadata dictionary.
    """

    def __init__(self, pdf_directory: str):
        """
        Args:
            pdf_directory: Path to the directory containing PDF files.
        """
        self.pdf_directory = Path(pdf_directory)
        # Populated by process_all_pdfs(): one dict per successfully parsed PDF.
        self.processed_docs: List[Dict[str, Any]] = []

    def get_pdf_files(self) -> List[Path]:
        """Recursively find all PDF files under the configured directory.

        Returns:
            Paths of every ``*.pdf`` file found (recursive glob).

        Raises:
            FileNotFoundError: If the configured directory does not exist.
        """
        if not self.pdf_directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.pdf_directory}")
        pdf_files = list(self.pdf_directory.glob("**/*.pdf"))
        logger.info(f"Found {len(pdf_files)} PDF files in {self.pdf_directory}")
        return pdf_files

    def extract_text_from_pdf(self, pdf_path: Path) -> Optional[Dict[str, Any]]:
        """Extract text and metadata from a single PDF file.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'text', 'metadata', 'filename', 'filepath' and
            'page_count' keys, or None if the file could not be processed.
        """
        try:
            # Text extraction via pdfplumber (more accurate than PyPDF2).
            with pdfplumber.open(pdf_path) as pdf:
                # Join once instead of accumulating with += (avoids
                # quadratic string concatenation on large documents).
                page_texts = [page.extract_text() for page in pdf.pages]
                text = "\n\n".join(pt for pt in page_texts if pt)

            # Metadata extraction via PyPDF2's document-information dict.
            with open(pdf_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                metadata = pdf_reader.metadata if pdf_reader.metadata else {}
                page_count = len(pdf_reader.pages)

            return {
                'text': text.strip(),
                'metadata': {
                    'title': metadata.get('/Title', ''),
                    'author': metadata.get('/Author', ''),
                    'subject': metadata.get('/Subject', ''),
                    'creator': metadata.get('/Creator', ''),
                },
                'filename': pdf_path.name,
                'filepath': str(pdf_path),
                'page_count': page_count,
            }
        except Exception as e:
            # Best-effort: log and skip unreadable/corrupt PDFs rather
            # than aborting the whole batch.
            logger.error(f"Error processing {pdf_path.name}: {str(e)}")
            return None

    def process_all_pdfs(self) -> List[Dict[str, Any]]:
        """Process every PDF file under the configured directory.

        Resets and repopulates ``self.processed_docs``; documents that
        yield no text are skipped with a warning.

        Returns:
            List of dictionaries containing extracted text and metadata.
        """
        pdf_files = self.get_pdf_files()
        self.processed_docs = []
        logger.info(f"Processing {len(pdf_files)} PDF files...")
        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            doc_data = self.extract_text_from_pdf(pdf_path)
            if doc_data and doc_data['text']:  # keep only docs with extracted text
                self.processed_docs.append(doc_data)
            else:
                logger.warning(f"No text extracted from {pdf_path.name}")
        logger.info(f"Successfully processed {len(self.processed_docs)} PDFs")
        return self.processed_docs

    def get_statistics(self) -> Dict[str, Any]:
        """Summary statistics over the processed documents.

        Returns:
            Dict with document/page/character totals and per-document
            averages, or an empty dict if nothing has been processed yet.
        """
        if not self.processed_docs:
            return {}
        total_pages = sum(doc['page_count'] for doc in self.processed_docs)
        total_chars = sum(len(doc['text']) for doc in self.processed_docs)
        return {
            'total_documents': len(self.processed_docs),
            'total_pages': total_pages,
            'total_characters': total_chars,
            'avg_pages_per_doc': total_pages / len(self.processed_docs),
            'avg_chars_per_doc': total_chars / len(self.processed_docs),
        }