File size: 1,931 Bytes
3022fd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from PyPDF2 import PdfReader
from typing import Dict, List
import json
from datetime import datetime
import os
from pathlib import Path

class PDFExtractor:
    """Extract per-page text from PDF files and save the result as JSON."""

    def __init__(self):
        pass

    def extract_text(self, file_path: str) -> Dict:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with the keys:
              * ``filename``: base name of ``file_path``.
              * ``total_pages``: number of pages in the document.
              * ``extracted_pages``: number of pages that yielded text.
              * ``text_by_page``: list of ``{"page_number", "text"}`` dicts
                (page numbers are 1-based; pages whose extraction returns
                an empty/falsy value are omitted).
              * ``timestamp``: ``datetime.now().isoformat()`` taken when the
                result is built.

        Raises:
            Exception: If opening the file or extracting text fails. The
                underlying error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PdfReader(file_path)
            total_pages = len(pdf_reader.pages)

            # Collect text page by page, numbering pages from 1 and skipping
            # pages for which no text could be extracted.
            text_by_page = []
            for page_num, page in enumerate(pdf_reader.pages, 1):
                text = page.extract_text()
                if text:
                    text_by_page.append({
                        "page_number": page_num,
                        "text": text,
                    })

            return {
                "filename": os.path.basename(file_path),
                "total_pages": total_pages,
                "extracted_pages": len(text_by_page),
                "text_by_page": text_by_page,
                "timestamp": datetime.now().isoformat(),
            }

        except Exception as e:
            # The message is Korean for "Error while extracting PDF text".
            # `from e` keeps the original error reachable as __cause__ so the
            # real failure (missing file, corrupt PDF, ...) is not hidden.
            raise Exception(f"PDF 텍스트 추출 중 오류 발생: {str(e)}") from e

    def save_extracted_text(self, extracted_data: Dict, output_dir: str) -> str:
        """Write extracted data to ``extracted_<name>.json`` in ``output_dir``.

        Args:
            extracted_data: Dict to serialize; must contain a ``filename``
                key, whose value (minus its final extension) names the
                output file.
            output_dir: Directory to write into. It is created, including
                missing parents, when it does not exist.

        Returns:
            The path of the written JSON file as a string.

        Raises:
            KeyError: If ``extracted_data`` has no ``filename`` key.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Strip only the final extension. Splitting on the first dot would
        # turn "my.report.v2.pdf" into "my" and let unrelated inputs collide
        # on the same output file.
        stem = Path(extracted_data['filename']).stem
        output_path = output_dir / f"extracted_{stem}.json"

        # ensure_ascii=False keeps non-ASCII text readable in the output.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        return str(output_path)

# Create the module-level shared (singleton-style) instance.
pdf_extractor = PDFExtractor()