# parseai_202506_2 / extractor.py
# bluewhale2025's picture
# Add ParseAI application code
# 4d4b48e
from PyPDF2 import PdfReader
from typing import Dict, List
import json
from datetime import datetime
import os
from pathlib import Path
class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def __init__(self):
        # The extractor keeps no state between calls.
        pass

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of every page in a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys:
              - ``filename``: base name of ``file_path``.
              - ``total_pages``: number of pages in the document.
              - ``extracted_pages``: number of pages that yielded text.
              - ``text_by_page``: list of ``{"page_number", "text"}`` dicts
                (1-based page numbers); pages whose extraction result is
                empty are omitted.
              - ``timestamp``: ``datetime.now().isoformat()`` at extraction
                time.

        Raises:
            Exception: If opening or parsing the PDF fails. The underlying
                error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PdfReader(file_path)
            total_pages = len(pdf_reader.pages)

            text_by_page = []
            for page_num, page in enumerate(pdf_reader.pages, 1):
                text = page.extract_text()
                # Skip pages that produced no text so they do not appear
                # as empty entries in the output.
                if text:
                    text_by_page.append({
                        "page_number": page_num,
                        "text": text,
                    })

            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
            # Keep the generic Exception type callers already catch, but
            # chain the root cause explicitly for debugging.
            raise Exception(f"PDF 텍스트 추출 중 오류 발생: {e}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Write extracted data to ``extracted_<stem>.json`` in ``output_dir``.

        Args:
            extracted_data: Dict to serialize; must contain a ``filename``
                key, whose stem is used to name the output file.
            output_dir: Directory to write into. It is created, including
                missing parents, if it does not exist.

        Returns:
            The path of the written JSON file, as a string.

        Raises:
            KeyError: If ``extracted_data`` has no ``filename`` key.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Use the path stem so only the final extension is removed:
        # "report.v2.pdf" -> "report.v2" rather than "report". Taking the
        # stem also discards any directory components in the stored name.
        stem = Path(extracted_data["filename"]).stem
        output_path = output_dir / f"extracted_{stem}.json"

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        return str(output_path)
# Module-level singleton instance of PDFExtractor.
pdf_extractor = PDFExtractor()