Spaces:
Running
Running
| # Import required libraries | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import List | |
| from langchain.schema import Document | |
| from .config import logger | |
| from langchain_pymupdf4llm import PyMuPDF4LLMLoader | |
| from langchain_community.document_loaders.parsers import TesseractBlobParser | |
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a PDF page by page and wrap each non-blank page in a Document.

    The file is read with PyMuPDF4LLMLoader in "page" mode, with image
    extraction enabled (images are handed to TesseractBlobParser) and
    table_strategy="lines".

    Disease and provider labels are taken from the last path components.
    Directory structure expected: Data/Disease Name/Provider Name/file.pdf

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects, one per page whose content is not blank.
        Each carries "source" (file stem), "disease", "provider" and
        "page_number" metadata.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Fail early with a clear message when the file is missing
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        # Derive disease / provider labels from the trailing path components
        segments = pdf_path.parts
        if len(segments) < 3:
            disease = provider = "unknown"
        else:
            grandparent, parent = segments[-3], segments[-2]
            # NOTE(review): when the grandparent directory is "data", disease and
            # provider both end up as the parent directory name - confirm intended.
            disease = parent if grandparent.lower() == "data" else grandparent
            provider = parent

        # Load every page as its own raw document
        pages = PyMuPDF4LLMLoader(
            str(pdf_path),
            mode="page",
            extract_images=True,
            images_parser=TesseractBlobParser(),
            table_strategy="lines",
        ).load()

        documents = []
        for position, page in enumerate(pages):
            # Pages with only whitespace are dropped
            if not page.page_content.strip():
                continue

            reported = page.metadata.get("page")
            if reported is None:
                # No page metadata: fall back to 1-indexed sequential numbering
                page_number = position + 1
            elif reported == position:
                # Reported value matches the 0-based position, so it is treated
                # as 0-indexed and shifted to a human-readable 1-indexed number
                page_number = reported + 1
            else:
                # Otherwise the reported value is used as-is
                page_number = reported

            documents.append(
                Document(
                    page_content=page.page_content,
                    metadata={
                        "source": pdf_path.stem,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_number,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} document pages from PDF - Disease: {disease}, Provider: {provider}")
        return documents
    except Exception as exc:
        # Log at this boundary, then let the caller see the original exception
        logger.error(f"Error loading PDF documents: {str(exc)}")
        raise
def load_markdown_documents(md_path: Path) -> List[Document]:
    """
    Load a Markdown file as a single Document.

    The whole file is read as UTF-8 text and stored unsplit in one Document.
    Disease and provider labels are taken from the last path components.
    Directory structure expected: Data/Disease Name/Provider Name/file.md

    Args:
        md_path: Path to the Markdown file

    Returns:
        List containing exactly one Document with "source" (file stem),
        "disease", "provider" and "page_number" (always 1) metadata.

    Raises:
        FileNotFoundError: If md_path does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Fail early with a clear message when the file is missing
        if not md_path.exists():
            raise FileNotFoundError(f"Markdown file not found at {md_path}")

        # Derive disease / provider labels from the trailing path components
        disease, provider = "unknown", "unknown"
        segments = md_path.parts
        if len(segments) >= 3:
            grandparent, parent = segments[-3:-1]
            # NOTE(review): when the grandparent directory is "data", disease and
            # provider both end up as the parent directory name - confirm intended.
            if grandparent.lower() != "data":
                disease = grandparent
            else:
                disease = parent
            provider = parent

        # Read the full markdown text in one go
        text = md_path.read_text(encoding='utf-8')

        # Minimal metadata for RAG; the file is treated as a single page
        result = [
            Document(
                page_content=text,
                metadata={
                    "source": md_path.stem,
                    "disease": disease,
                    "provider": provider,
                    "page_number": 1,
                },
            )
        ]
        logger.info(f"Loaded Markdown document - Disease: {disease}, Provider: {provider}")
        return result
    except Exception as exc:
        # Log at this boundary, then let the caller see the original exception
        logger.error(f"Error loading Markdown document: {str(exc)}")
        raise