bluewhale2025's picture
Initial commit: Add ParseAI document processor application
3022fd1
raw
history blame
1.93 kB
from PyPDF2 import PdfReader
from typing import Dict, List
import json
from datetime import datetime
import os
from pathlib import Path
class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def __init__(self):
        """Create an extractor; no state is stored on the instance."""

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of each page of a PDF file.

        Args:
            file_path: Path of the PDF to read; passed to ``PdfReader``.

        Returns:
            A dict with the keys ``filename`` (base name of ``file_path``),
            ``total_pages``, ``extracted_pages``, ``text_by_page`` (a list of
            ``{"page_number", "text"}`` dicts, page numbers starting at 1)
            and ``timestamp`` (``datetime.now().isoformat()``). Pages whose
            extracted text is empty are omitted from ``text_by_page``, so
            ``extracted_pages`` can be smaller than ``total_pages``.

        Raises:
            Exception: If reading or extraction fails for any reason. The
                original error is chained and available as ``__cause__``.
        """
        try:
            # Open the PDF.
            pdf_reader = PdfReader(file_path)

            # Count the pages.
            total_pages = len(pdf_reader.pages)

            # Collect the text page by page, skipping pages with no text.
            text_by_page = []
            for page_num, page in enumerate(pdf_reader.pages, 1):
                text = page.extract_text()
                if text:
                    text_by_page.append({
                        "page_number": page_num,
                        "text": text
                    })

            # Build the result.
            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            # NOTE(review): the message text below looks mis-encoded (probably
            # Korean read with the wrong codec) — confirm against the original
            # file before changing it; it is kept verbatim here.
            # `from e` keeps the underlying error as the explicit cause.
            raise Exception(f"PDF ํ…์ŠคํŠธ ์ถ”์ถœ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Save extracted data as ``extracted_<stem>.json`` inside ``output_dir``.

        Args:
            extracted_data: JSON-serialisable dict; its ``"filename"`` entry
                determines the output file name.
            output_dir: Directory to write into. It is created, including
                missing parents, if it does not exist.

        Returns:
            The path of the written JSON file, as a string.

        Raises:
            KeyError: If ``extracted_data`` has no ``"filename"`` key.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # Path.stem removes only the final suffix, so "report.v2.pdf" yields
        # "report.v2". Cutting at the first dot would map "report.v2.pdf" and
        # "report.v3.pdf" to the same output file and overwrite one of them.
        stem = Path(extracted_data["filename"]).stem
        output_path = target_dir / f"extracted_{stem}.json"

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        return str(output_path)
# Shared module-level instance, created when this module is imported
# (the class itself does not enforce a singleton).
pdf_extractor = PDFExtractor()