Spaces:
Sleeping
Sleeping
| import pymupdf4llm | |
| from langchain.docstore.document import Document | |
def ExtractDatafrompdf(pdf_path) -> "list[Document]":
    """Extract PDF data using pymupdf4llm and return LangChain Documents.

    The PDF is converted to Markdown with ``page_chunks=True`` so that one
    chunk is produced per page. Image extraction is requested
    (``write_images=True``) with PNG output under the ``images`` path.

    Args:
        pdf_path: PDF input forwarded unchanged to
            ``pymupdf4llm.to_markdown``.

    Returns:
        One ``Document`` per page chunk, in the order returned by
        pymupdf4llm. ``page_content`` is the chunk's ``"text"`` value and
        ``metadata["page_number"]`` is the chunk's 1-based position in
        that sequence. An empty list is returned when no chunks are
        produced.

    Raises:
        KeyError: If a page chunk has no ``"text"`` entry.
    """
    md_pages = pymupdf4llm.to_markdown(
        pdf_path,
        write_images=True,
        image_path="images",
        image_format="png",
        page_chunks=True,
    )

    # Debug aid: show the shape of one chunk. Guarded so that a result with
    # no pages does not raise IndexError.
    if md_pages:
        print("First page structure:", md_pages[0])

    # Number pages by position. Looking a chunk up with ``list.index`` would
    # return the first *equal* chunk, mislabeling pages whose chunk dicts
    # compare equal, and would rescan the list for every page.
    return [
        Document(page_content=page["text"], metadata={"page_number": page_num})
        for page_num, page in enumerate(md_pages, start=1)
    ]