|
|
|
|
|
import pandas as pd |
|
|
from pathlib import Path |
|
|
from typing import List |
|
|
from langchain.schema import Document |
|
|
from .config import logger |
|
|
from langchain_pymupdf4llm import PyMuPDF4LLMLoader |
|
|
from langchain_community.document_loaders.parsers import TesseractBlobParser |
|
|
|
|
|
|
|
|
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a medical-guideline PDF as one Document per non-empty page.

    The file is read with PyMuPDF4LLMLoader in ``mode="page"``, with image
    extraction enabled (using a ``TesseractBlobParser`` as the images parser)
    and the ``"lines"`` table strategy. Disease and provider are derived from
    the directory names above the file.

    Directory structure expected: Data/Disease Name/Provider Name/file.pdf

    If the file sits directly in ``Data/Disease Name/`` (no provider folder),
    the disease is still taken from the parent folder and the provider is
    reported as ``"unknown"``. Paths with fewer than three components yield
    ``"unknown"`` for both.

    Args:
        pdf_path: Path to the PDF file (a plain string is also accepted).

    Returns:
        List of Document objects, one per page whose text is not empty or
        whitespace-only. Each carries the metadata keys ``source`` (file name
        without extension), ``disease``, ``provider`` and ``page_number``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist or is not a file.
        Exception: Any other error raised while loading is logged and
            re-raised unchanged.
    """
    try:
        # Accept plain strings as well as Path objects.
        pdf_path = Path(pdf_path)

        # is_file() also rejects directories, which would otherwise only
        # fail later inside the loader with a less helpful error.
        if not pdf_path.is_file():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "unknown"
        provider = "unknown"
        path_parts = pdf_path.parts
        if len(path_parts) >= 3:
            if path_parts[-3].lower() == "data":
                # File lives directly in Data/<Disease>/: the parent folder
                # is the disease and there is no provider folder, so the
                # provider stays "unknown" instead of repeating the disease.
                disease = path_parts[-2]
            else:
                disease = path_parts[-3]
                provider = path_parts[-2]

        loader = PyMuPDF4LLMLoader(
            str(pdf_path),
            mode="page",
            extract_images=True,
            images_parser=TesseractBlobParser(),
            table_strategy="lines",
        )
        raw_documents = loader.load()

        documents = []
        for idx, doc in enumerate(raw_documents):
            # Skip pages that produced no text at all.
            if not doc.page_content.strip():
                continue

            actual_page = doc.metadata.get("page")
            if actual_page is None:
                # No page metadata: fall back to the 1-based position.
                page_num = idx + 1
            elif actual_page == idx:
                # Metadata matches the 0-based position, so treat it as
                # 0-based and convert to a 1-based page number.
                page_num = actual_page + 1
            else:
                # Otherwise use the loader's value as-is.
                page_num = actual_page

            documents.append(
                Document(
                    page_content=doc.page_content,
                    metadata={
                        "source": pdf_path.stem,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} document pages from PDF - Disease: {disease}, Provider: {provider}")
        return documents

    except Exception as e:
        logger.error(f"Error loading PDF documents: {str(e)}")
        raise
|
|
|
|
|
|
|
|
def load_markdown_documents(md_path: Path) -> List[Document]:
    """
    Load a Markdown medical guideline as a single Document.

    The whole file is read as UTF-8 text and wrapped in one Document; no
    splitting is performed here. Disease and provider are derived from the
    directory names above the file.

    Directory structure expected: Data/Disease Name/Provider Name/file.md

    If the file sits directly in ``Data/Disease Name/`` (no provider folder),
    the disease is still taken from the parent folder and the provider is
    reported as ``"unknown"``. Paths with fewer than three components yield
    ``"unknown"`` for both.

    Args:
        md_path: Path to the Markdown file (a plain string is also accepted).

    Returns:
        A one-element list holding the Document. Its metadata has the keys
        ``source`` (file name without extension), ``disease``, ``provider``
        and ``page_number`` (always 1).

    Raises:
        FileNotFoundError: If ``md_path`` does not exist or is not a file.
        Exception: Any other error raised while reading is logged and
            re-raised unchanged.
    """
    try:
        # Accept plain strings as well as Path objects.
        md_path = Path(md_path)

        # is_file() also rejects directories, which would otherwise surface
        # as an unrelated OS error when the file is opened.
        if not md_path.is_file():
            raise FileNotFoundError(f"Markdown file not found at {md_path}")

        disease = "unknown"
        provider = "unknown"
        path_parts = md_path.parts
        if len(path_parts) >= 3:
            if path_parts[-3].lower() == "data":
                # File lives directly in Data/<Disease>/: the parent folder
                # is the disease and there is no provider folder, so the
                # provider stays "unknown" instead of repeating the disease.
                disease = path_parts[-2]
            else:
                disease = path_parts[-3]
                provider = path_parts[-2]

        content = md_path.read_text(encoding="utf-8")

        doc = Document(
            page_content=content,
            metadata={
                "source": md_path.stem,
                "disease": disease,
                "provider": provider,
                # Markdown has no pages; the whole file counts as page 1.
                "page_number": 1,
            },
        )

        logger.info(f"Loaded Markdown document - Disease: {disease}, Provider: {provider}")
        return [doc]

    except Exception as e:
        logger.error(f"Error loading Markdown document: {str(e)}")
        raise
|
|
|
|
|
|