Spaces:
Sleeping
Sleeping
File size: 4,144 Bytes
2a8faae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
from .config import logger
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_community.document_loaders.parsers import TesseractBlobParser
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a medical-guideline PDF and return one Document per non-empty page.

    The file is parsed with PyMuPDF4LLMLoader in ``mode="page"``, with image
    extraction enabled (images are handed to a TesseractBlobParser) and
    ``table_strategy="lines"``.

    Disease and provider are derived from the directory structure.
    Directory structure expected: Data/Disease Name/Provider Name/file.pdf
    When the path has fewer than three components both values stay
    "unknown". When the grandparent directory is named "data"
    (case-insensitive), the parent directory name is used for the disease
    as well as for the provider.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects, one per page whose content is not purely
        whitespace. Each carries the metadata keys "source" (file name),
        "disease", "provider" and "page_number" (1-based).

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any error raised while loading is logged and re-raised
            unchanged.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        # Extract disease and provider from directory structure
        path_parts = pdf_path.parts
        disease = "unknown"
        provider = "unknown"
        if len(path_parts) >= 3:
            # Disease is the grandparent directory, unless that directory is
            # the "data" root itself; then the parent directory is used.
            disease = path_parts[-3] if path_parts[-3].lower() != "data" else path_parts[-2]
            # Provider is always the immediate parent directory.
            provider = path_parts[-2]

        # Initialize PyMuPDF4LLMLoader
        loader = PyMuPDF4LLMLoader(
            str(pdf_path),
            mode="page",
            extract_images=True,
            images_parser=TesseractBlobParser(),
            table_strategy="lines",
        )
        raw_documents = loader.load()

        documents = []
        for idx, doc in enumerate(raw_documents):
            # Skip pages that produced no text at all.
            if not doc.page_content.strip():
                continue

            # The loader reports "page" as a zero-based index (per the
            # langchain-pymupdf4llm documentation), while the positional
            # fallback is one-based. Normalize both to one-based so the
            # first page is page 1 regardless of which source is used.
            raw_page = doc.metadata.get("page")
            if isinstance(raw_page, int):
                page_number = raw_page + 1
            else:
                page_number = idx + 1

            processed_doc = Document(
                page_content=doc.page_content,
                metadata={
                    "source": pdf_path.name,
                    "disease": disease,
                    "provider": provider,
                    "page_number": page_number,
                },
            )
            documents.append(processed_doc)

        logger.info(f"Loaded {len(documents)} document pages from PDF - Disease: {disease}, Provider: {provider}")
        return documents
    except Exception as e:
        # Log for diagnostics, then let the caller decide how to handle it.
        logger.error(f"Error loading PDF documents: {str(e)}")
        raise
def load_markdown_documents(md_path: Path) -> List[Document]:
    """
    Read a Markdown medical guideline into a single Document.

    Disease and provider are derived from the directory structure.
    Directory structure expected: Data/Disease Name/Provider Name/file.md
    When the path has fewer than three components both values are
    "unknown". When the grandparent directory is named "data"
    (case-insensitive), the parent directory name is used for the disease
    as well as for the provider.

    Args:
        md_path: Path to the Markdown file (read as UTF-8).

    Returns:
        A one-element list holding a Document with the whole file as its
        content and the metadata keys "source" (file name), "disease",
        "provider" and "page_number" (always 1).

    Raises:
        FileNotFoundError: If ``md_path`` does not exist.
        Exception: Any error raised while loading is logged and re-raised
            unchanged.
    """
    try:
        # Bail out early when there is nothing to read
        if not md_path.exists():
            raise FileNotFoundError(f"Markdown file not found at {md_path}")

        # Work out disease / provider from the trailing path components
        parts = md_path.parts
        if len(parts) < 3:
            disease = "unknown"
            provider = "unknown"
        else:
            grandparent = parts[-3]
            parent = parts[-2]
            provider = parent
            if grandparent.lower() == "data":
                disease = parent
            else:
                disease = grandparent

        # Pull the full Markdown text into memory
        text = md_path.read_text(encoding="utf-8")

        # Minimal metadata for RAG; the whole file counts as page 1
        metadata = {
            "source": md_path.name,
            "disease": disease,
            "provider": provider,
            "page_number": 1,
        }
        markdown_doc = Document(page_content=text, metadata=metadata)

        logger.info(f"Loaded Markdown document - Disease: {disease}, Provider: {provider}")
        return [markdown_doc]
    except Exception as e:
        logger.error(f"Error loading Markdown document: {str(e)}")
        raise
|