Spaces:
Runtime error
Runtime error
| # core/pdf_loader.py | |
"""PDF text extraction."""
| import pymupdf4llm | |
| import fitz # PyMuPDF | |
| from typing import Dict, List | |
def load_pdf(pdf_path: str) -> Dict:
    """
    Load a PDF file and extract its text.

    The document is read twice: once by ``pymupdf4llm.to_markdown`` to get a
    Markdown rendering of the whole file, and once with PyMuPDF (``fitz``) to
    collect the text of each page via ``page.get_text()``. The whole-document
    text and the per-page text therefore come from different extractors and
    are not guaranteed to match character for character.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict: {
            "text": "Markdown text of the whole document",
            "pages": [
                {"page_num": 1, "text": "..."},
                ...
            ],
            "total_pages": <number of entries in "pages">
        }
    """
    print(f"๐ PDF ๋ก๋ ์ค: {pdf_path}")

    # Extract the whole document as Markdown with pymupdf4llm.
    md_text = pymupdf4llm.to_markdown(pdf_path)

    # Split by page. The context manager closes the document even when
    # extracting a page raises, which the manual close() did not guarantee.
    with fitz.open(pdf_path) as doc:
        pages = [
            {
                "page_num": page_num,  # 1-based page number
                "text": page.get_text(),
            }
            for page_num, page in enumerate(doc, start=1)
        ]

    print(f"โ {len(pages)}ํ์ด์ง ์ถ์ถ ์๋ฃ")

    return {
        "text": md_text,
        "pages": pages,
        "total_pages": len(pages),
    }
if __name__ == "__main__":
    # Manual smoke test: run the module with a PDF path as the first
    # command-line argument. Without an argument nothing is done.
    import sys

    if len(sys.argv) > 1:
        result = load_pdf(sys.argv[1])
        print(f"์ด ํ์ด์ง: {result['total_pages']}")
        # Only preview the first page when there is one; indexing an empty
        # page list would raise IndexError.
        if result["pages"]:
            print(f"์ฒซ ํ์ด์ง ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {result['pages'][0]['text'][:200]}")