# bluewhale2025
# Initial commit: Add ParseAI document processor application
# 3022fd1
from PyPDF2 import PdfReader
from typing import Dict, List
import json
from datetime import datetime
import os
from pathlib import Path
class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def __init__(self):
        # The extractor keeps no state; nothing to initialise.
        pass

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of every page in a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys:
              - ``filename``: base name of ``file_path``.
              - ``total_pages``: number of pages in the document.
              - ``extracted_pages``: number of pages that yielded text.
              - ``text_by_page``: list of ``{"page_number", "text"}`` dicts
                (page numbers are 1-based; pages whose extracted text is
                empty are omitted).
              - ``timestamp``: ``datetime.now()`` at extraction time, in
                ISO 8601 format.

        Raises:
            Exception: If opening or parsing the PDF fails. The underlying
                error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PdfReader(file_path)
            pages = pdf_reader.pages
            total_pages = len(pages)

            # Pair each 1-based page number with its extracted text, then
            # keep only the pages that actually produced some text.
            page_texts = (
                (page_num, page.extract_text())
                for page_num, page in enumerate(pages, 1)
            )
            text_by_page = [
                {"page_number": page_num, "text": text}
                for page_num, text in page_texts
                if text
            ]

            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
            # Keep the generic Exception type callers already catch, but
            # chain the original error so the real traceback is preserved.
            raise Exception(f"PDF 텍스트 추출 중 오류 발생: {str(e)}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Save extracted text data as a JSON file.

        The file is written to ``output_dir`` (created if missing, including
        parent directories) and named ``extracted_<stem>.json``, where
        ``<stem>`` is ``extracted_data["filename"]`` without its final
        extension.

        Args:
            extracted_data: JSON-serialisable dict containing at least a
                ``filename`` key, such as the dict returned by
                ``extract_text``.
            output_dir: Directory in which to write the JSON file.

        Returns:
            The path of the written JSON file, as a string.

        Raises:
            KeyError: If ``extracted_data`` has no ``filename`` key.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # Strip only the last extension: splitting on the first dot would map
        # "report.v1.pdf" and "report.v2.pdf" to the same output file.
        stem = Path(extracted_data["filename"]).stem
        output_path = target_dir / f"extracted_{stem}.json"

        with open(output_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        return str(output_path)
# Create the shared module-level instance (used as a singleton).
pdf_extractor = PDFExtractor()