Spaces:
Build error
Build error
File size: 1,931 Bytes
3022fd1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
from PyPDF2 import PdfReader
from typing import Dict, List
import json
from datetime import datetime
import os
from pathlib import Path
class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys:
              - ``filename``: base name of ``file_path``.
              - ``total_pages``: number of pages reported by the reader.
              - ``extracted_pages``: number of pages that yielded non-empty
                text.
              - ``text_by_page``: list of ``{"page_number", "text"}`` dicts
                (page numbers are 1-based; pages without text are omitted).
              - ``timestamp``: naive local time of the extraction, in ISO 8601
                format.

        Raises:
            Exception: If anything fails while reading or extracting. The
                original error is chained as ``__cause__`` so its traceback
                is preserved.
        """
        try:
            pdf_reader = PdfReader(file_path)
            total_pages = len(pdf_reader.pages)

            # Collect text page by page, skipping pages that produce nothing
            # (extract_text() returned an empty/falsy value).
            text_by_page = []
            for page_num, page in enumerate(pdf_reader.pages, 1):
                text = page.extract_text()
                if text:
                    text_by_page.append({
                        "page_number": page_num,
                        "text": text,
                    })

            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
            # Keep raising a plain Exception (callers may rely on it), but
            # chain the cause instead of discarding the original traceback.
            raise Exception(f"PDF 텍스트 추출 중 오류 발생: {e}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Write extracted text to ``<output_dir>/extracted_<stem>.json``.

        Args:
            extracted_data: Dict as returned by ``extract_text``; must contain
                a ``"filename"`` key and be JSON-serializable.
            output_dir: Directory to write into. It is created (including
                missing parents) if it does not exist.

        Returns:
            The path of the written JSON file, as a string.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # Strip only the final extension. Splitting on the first dot would
        # turn "report.v2.pdf" into "report" and let differently named PDFs
        # overwrite each other's output.
        stem = Path(extracted_data["filename"]).stem
        output_path = target_dir / f"extracted_{stem}.json"

        # ensure_ascii=False keeps non-ASCII text readable in the file, hence
        # the explicit UTF-8 encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)
        return str(output_path)
# ์ฑ๊ธํค ์ธ์คํด์ค ์์ฑ
pdf_extractor = PDFExtractor()
|