File size: 1,336 Bytes
c2f0e66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# core/pdf_loader.py
"""PDF text extraction."""
import pymupdf4llm
import fitz  # PyMuPDF
from typing import Dict, List


def load_pdf(pdf_path: str) -> Dict:
    """
    Load a PDF and extract its text, both as a whole and page by page.

    The whole-document text comes from ``pymupdf4llm.to_markdown``; the
    per-page text comes from calling ``get_text()`` on every page of the
    document opened with ``fitz.open``. The path is handed to both libraries
    independently. Progress messages are printed to stdout.

    Errors raised by either library are not caught here and propagate to the
    caller.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict: {
            "text": value returned by ``pymupdf4llm.to_markdown``,
            "pages": [
                {"page_num": 1, "text": "..."},   # page_num is 1-based
                ...
            ],
            "total_pages": number of entries in "pages"
        }
    """
    print(f"๐Ÿ“„ PDF ๋กœ๋“œ ์ค‘: {pdf_path}")

    # Whole-document extraction via pymupdf4llm (markdown output).
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # Per-page extraction. The context manager guarantees the document is
    # closed even if get_text() raises part-way through the pages.
    with fitz.open(pdf_path) as doc:
        pages = [
            {"page_num": page_num, "text": page.get_text()}
            for page_num, page in enumerate(doc, start=1)
        ]

    print(f"โœ… {len(pages)}ํŽ˜์ด์ง€ ์ถ”์ถœ ์™„๋ฃŒ")

    return {
        "text": md_text,
        "pages": pages,
        "total_pages": len(pages),
    }


if __name__ == "__main__":
    # Manual smoke test: pass a PDF path as the first command-line argument.
    # With no argument the script does nothing.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        loaded = load_pdf(cli_args[0])
        print(f"์ด ํŽ˜์ด์ง€: {loaded['total_pages']}")
        # Preview: first 200 characters of the first page's text.
        preview = loaded["pages"][0]["text"][:200]
        print(f"์ฒซ ํŽ˜์ด์ง€ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {preview}")