Spaces:
Runtime error
Runtime error
File size: 1,336 Bytes
c2f0e66 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | # core/pdf_loader.py
"""PDF ํ
์คํธ ์ถ์ถ"""
import pymupdf4llm
import fitz # PyMuPDF
from typing import Dict, List
def load_pdf(pdf_path: str) -> Dict:
    """
    Load a PDF file and extract its text.

    The whole document is converted to Markdown with ``pymupdf4llm``; the
    text of each page is additionally read one page at a time with PyMuPDF
    (``fitz``) via ``page.get_text()``.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict: {
            "text": "full document text (Markdown)",
            "pages": [
                {"page_num": 1, "text": "..."},
                ...
            ],
            "total_pages": <number of entries in "pages">
        }
    """
    print(f"📄 PDF 로드 중: {pdf_path}")

    # Extract the text of the whole document with pymupdf4llm (Markdown format).
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # Split into pages (numbered from 1). The context manager guarantees the
    # document is closed even if reading one of the pages raises.
    with fitz.open(pdf_path) as doc:
        pages = [
            {"page_num": page_num, "text": page.get_text()}
            for page_num, page in enumerate(doc, start=1)
        ]

    print(f"✅ {len(pages)}페이지 추출 완료")

    return {
        "text": md_text,
        "pages": pages,
        "total_pages": len(pages),
    }
if __name__ == "__main__":
    # Manual test: pass the path of a PDF as the first command-line argument.
    import sys

    if len(sys.argv) > 1:
        result = load_pdf(sys.argv[1])
        print(f"총 페이지: {result['total_pages']}")
        # Only preview the first page when there is one; indexing an empty
        # page list would raise IndexError.
        if result["pages"]:
            print(f"첫 페이지 미리보기: {result['pages'][0]['text'][:200]}")
|