kwmin_probin / core /pdf_loader.py
cksleigen's picture
add files
c2f0e66
# core/pdf_loader.py
"""PDF text extraction."""
import pymupdf4llm
import fitz # PyMuPDF
from typing import Dict, List
def load_pdf(pdf_path: str) -> Dict:
    """
    Load a PDF file and extract its text.

    The whole document is converted to Markdown with ``pymupdf4llm``, and the
    same file is opened with PyMuPDF (``fitz``) to collect the plain text of
    each page separately. Progress messages are printed to stdout.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict: {
            "text": value returned by pymupdf4llm.to_markdown(pdf_path),
            "pages": [
                {"page_num": 1, "text": "..."},
                ...
            ],
            "total_pages": number of entries in "pages"
        }

    Raises:
        Nothing is caught here: any exception raised by pymupdf4llm or
        PyMuPDF propagates to the caller.
    """
    print(f"๐Ÿ“„ PDF ๋กœ๋“œ ์ค‘: {pdf_path}")

    # Whole-document extraction in Markdown form via pymupdf4llm.
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # Per-page extraction. The context manager guarantees the document is
    # closed even if get_text() raises part-way through, which the previous
    # explicit doc.close() after the loop did not.
    with fitz.open(pdf_path) as doc:
        pages = [
            {"page_num": page_num, "text": page.get_text()}
            for page_num, page in enumerate(doc, start=1)  # 1-based numbering
        ]

    print(f"โœ… {len(pages)}ํŽ˜์ด์ง€ ์ถ”์ถœ ์™„๋ฃŒ")

    return {
        "text": md_text,
        "pages": pages,
        "total_pages": len(pages),
    }
if __name__ == "__main__":
    # Manual check: run the loader on the path given as the first
    # command-line argument; do nothing when no argument is supplied.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        result = load_pdf(cli_args[0])

        page_total = result["total_pages"]
        print(f"์ด ํŽ˜์ด์ง€: {page_total}")

        # Preview is limited to the first 200 characters of the first page.
        first_page_text = result["pages"][0]["text"]
        preview = first_page_text[:200]
        print(f"์ฒซ ํŽ˜์ด์ง€ ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {preview}")