# doc_knowledge_base/pdf_processor.py
"""
PDF processing utilities for extracting text, sections, and structured data from clinical documents.
"""
import os
import re
import fitz # PyMuPDF
from typing import Dict, List, Tuple, Optional, Any
import json
from collections import defaultdict
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Extract text, sections, and metadata from clinical PDFs and chunk them for embedding."""

    def __init__(self, upload_dir="./data/uploads"):
        """Create a processor whose uploaded PDFs live in ``upload_dir`` (created if absent)."""
        self.upload_dir = upload_dir
        # Ensure the target directory exists up front so later saves cannot
        # fail on a missing path.
        os.makedirs(self.upload_dir, exist_ok=True)
def save_uploaded_file(self, uploaded_file) -> str:
    """Persist an uploaded file object into the upload directory.

    The object must expose ``.name`` and ``.getbuffer()`` (e.g. a Streamlit
    upload). Returns the full path the bytes were written to.
    """
    destination = os.path.join(self.upload_dir, uploaded_file.name)
    payload = uploaded_file.getbuffer()
    with open(destination, "wb") as out_file:
        out_file.write(payload)
    return destination
def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]:
    """
    Extract text from a PDF, page by page.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        Tuple containing:
        - Full text string (pages separated by blank lines)
        - List of dicts with 1-based "page_num" and per-page "text"
        On any extraction error, returns ("", []) so callers degrade gracefully.
    """
    try:
        page_texts = []
        pages = []
        # Context manager guarantees the document handle is closed even if
        # a page fails mid-extraction (the original leaked the handle on error).
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                page_texts.append(text)
                pages.append({
                    "page_num": page_num + 1,  # 1-based for human-facing citations
                    "text": text
                })
        # Single join instead of repeated += (quadratic on large documents);
        # preserves the original trailing "\n\n" after the last page.
        full_text = "\n\n".join(page_texts) + ("\n\n" if page_texts else "")
        return full_text, pages
    except Exception as e:
        # Best-effort: log and return empty results rather than crash the pipeline.
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return "", []
def identify_section_titles(self, text: str) -> List[Dict]:
    """
    Identify potential section titles based on common patterns in clinical documents.

    Args:
        text: Full document text to scan, line by line.

    Returns:
        List of dicts with "section_num" (None for unnumbered headers),
        "section_title", 0-based "line_num", and the raw "text" of the line.
    """
    # Common patterns for section headers in protocols and SAPs
    patterns = [
        # Numbered sections like "1. INTRODUCTION" or "2.3 Statistical Analysis".
        # BUG FIX: the original pattern had no optional dot after the section
        # number, so "1. INTRODUCTION" (dot + space) never matched.
        r'^(\d+(?:\.\d+)*)\.?\s+([A-Z][A-Za-z\s]+)$',
        # ALL CAPS headers like "OBJECTIVES AND ENDPOINTS"
        r'^([A-Z][A-Z\s]{3,})$',
        # Title case headers with optional trailing colon
        r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$'
    ]
    sections = []
    for line_num, line in enumerate(text.split('\n')):
        line = line.strip()
        if not line:
            continue
        for pattern in patterns:
            matches = re.match(pattern, line)
            if not matches:
                continue
            if len(matches.groups()) > 1:
                # Numbered pattern: first group is the section number.
                section_num, section_title = matches.groups()
            else:
                # Unnumbered pattern: no section number available.
                section_num, section_title = None, matches.group(1)
            sections.append({
                "section_num": section_num,
                "section_title": section_title.strip(),
                "line_num": line_num,
                "text": line
            })
            break  # First matching pattern wins for this line.
    return sections
def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]:
    """
    Split the full text into logical sections using detected section titles.

    Args:
        full_text: Complete document text.
        filename: Source filename (kept for interface compatibility; unused here).

    Returns:
        Mapping of section title -> section text (header line included).
        Falls back to a single "document" entry when no headers are found.
    """
    lines = full_text.split('\n')
    markers = self.identify_section_titles(full_text)
    # No recognizable headers: treat the whole document as one section.
    if not markers:
        return {"document": full_text}
    markers = sorted(markers, key=lambda marker: marker["line_num"])
    # Pair each marker with the line where the next one starts (or EOF),
    # so each section spans [its header line, next header line).
    boundaries = [marker["line_num"] for marker in markers[1:]] + [len(lines)]
    sections = {}
    for marker, end_line in zip(markers, boundaries):
        body = '\n'.join(lines[marker["line_num"]:end_line])
        sections[marker["section_title"]] = body
    return sections
def chunk_text(self, text: str, metadata: Dict[str, Any],
chunk_size: int = 1000, overlap: int = 200) -> List[Dict]:
"""
Split text into chunks suitable for embedding.
Args:
text: Text to chunk
metadata: Metadata to include with each chunk
chunk_size: Maximum size of each chunk
overlap: Overlap between chunks
Returns:
List of dictionaries with page_content and metadata
"""
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=overlap,
length_function=len,
)
chunks = text_splitter.create_documents(
[text],
metadatas=[metadata]
)
return [{"page_content": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks]
def process_document_for_vector_store(self, pdf_path: str,
                                      document_metadata: Dict[str, Any]) -> List[Dict]:
    """
    Prepare a document for the vector store: extract its text, split it
    into sections, chunk each section, and stamp metadata onto every chunk.

    Args:
        pdf_path: Path to the PDF file.
        document_metadata: Document-level metadata copied into each chunk.

    Returns:
        List of {"page_content", "metadata"} dicts ready for indexing.
    """
    basename = os.path.basename(pdf_path)
    full_text, _pages = self.extract_text_from_pdf(pdf_path)
    sections = self.split_into_sections(full_text, basename)
    all_chunks = []
    for section_name, section_text in sections.items():
        # Copy so per-section keys never leak into the caller's dict.
        per_section = dict(document_metadata)
        per_section["section"] = section_name
        per_section["source"] = basename
        all_chunks.extend(self.chunk_text(section_text, per_section))
    return all_chunks
def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]:
    """
    Attempt to extract tables from the PDF.

    Not implemented: robust table extraction requires dedicated tooling
    (e.g. Camelot, Tabula, or commercial OCR APIs), so this stub returns
    an empty list and callers treat the document as table-free.

    Returns:
        Empty list (placeholder for future table records with page number
        and content).
    """
    return []
def identify_document_type(self, text: str, filename: str) -> str:
    """
    Identify the document type (Protocol, SAP, etc.) from filename and content.

    Args:
        text: Document text (a leading sample is sufficient).
        filename: Original filename.

    Returns:
        One of "Protocol", "Statistical Analysis Plan",
        "Clinical Study Report", "Investigator Brochure", or "Unknown".
    """
    lower_text = text.lower()
    lower_filename = filename.lower()

    # Filename cues take priority over content cues.
    if "protocol" in lower_filename or "prot" in lower_filename:
        return "Protocol"
    if "sap" in lower_filename or "analysis plan" in lower_filename:
        return "Statistical Analysis Plan"
    if "csr" in lower_filename or "study report" in lower_filename:
        return "Clinical Study Report"
    # BUG FIX: the original tested `"ib" in lower_filename`, which matched the
    # substring "ib" inside unrelated words (e.g. "feasibility", "library")
    # and misclassified those files. Require "ib" to stand alone as a token;
    # the investigator+brochure combination is kept as before.
    if re.search(r'(?<![a-z0-9])ib(?![a-z0-9])', lower_filename) or \
            ("investigator" in lower_filename and "brochure" in lower_filename):
        return "Investigator Brochure"

    # Content cues.
    if "statistical analysis plan" in lower_text:
        return "Statistical Analysis Plan"
    if "clinical study protocol" in lower_text or "study protocol" in lower_text:
        return "Protocol"
    if "clinical study report" in lower_text:
        return "Clinical Study Report"
    if "investigator's brochure" in lower_text or "investigator brochure" in lower_text:
        return "Investigator Brochure"

    # Default when nothing matched.
    return "Unknown"
def extract_protocol_id(self, text: str, filename: str) -> Optional[str]:
    """
    Attempt to extract the protocol ID from document text or filename.

    Args:
        text: Document text; only the first 1000 characters are scanned,
              since IDs usually appear on the title page.
        filename: Fallback source when the text yields nothing.

    Returns:
        The first matching protocol ID string, or None.
    """
    # Ordered from most to least specific ID style.
    id_patterns = (
        r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)',   # e.g. "Protocol B9531002"
        r'([A-Z][0-9]{4,}-[0-9]{3})',                 # e.g. "C5161-001"
        r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})',        # e.g. "ABC-123-456"
        r'([A-Z]{2,5}[0-9]{4,6})',                    # e.g. "XYZ12345"
    )
    # Scan the title-page sample first, then fall back to the filename;
    # within each source, try every pattern before moving on.
    for candidate_source in (text[:1000], filename):
        for pattern in id_patterns:
            found = re.search(pattern, candidate_source)
            if found:
                return found.group(1)
    return None
def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]:
    """
    Extract lightweight metadata from a PDF without structural parsing.

    Args:
        pdf_path: Path to the PDF on disk.

    Returns:
        Dict with document_id, filename, protocol_id, type, title, and path.
    """
    filename = os.path.basename(pdf_path)
    full_text, _ = self.extract_text_from_pdf(pdf_path)
    # The opening pages carry the title/ID, so a 5000-char sample is plenty.
    sample_text = full_text[:5000]
    protocol_id = self.extract_protocol_id(sample_text, filename)
    doc_type = self.identify_document_type(sample_text, filename)
    # Heuristic title: first reasonably long (but not huge) line near the top.
    title = "Unknown Title"
    for raw_line in sample_text.split('\n'):
        candidate = raw_line.strip()
        if 20 < len(candidate) < 200:
            title = candidate
            break
    return {
        "document_id": os.path.splitext(filename)[0],
        "filename": filename,
        "protocol_id": protocol_id,
        "type": doc_type,
        "title": title,
        "path": pdf_path,
    }
def process_complete_document(self, pdf_path: str) -> Dict[str, Any]:
    """
    Process a complete document: metadata, sections, and vector-store chunks.
    This is the main entry point for document processing.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict with "status" ("success" or "error"), plus metadata, sections,
        page_count, chunks and chunk_count on success, or an "error" message.
    """
    basename = os.path.basename(pdf_path)
    results = {
        "status": "success",
        "pdf_path": pdf_path,
        "filename": basename,
    }
    try:
        # 1) Lightweight metadata (id, type, title).
        metadata = self.extract_basic_metadata(pdf_path)
        results["metadata"] = metadata

        # 2) Full text split into named sections.
        full_text, pages = self.extract_text_from_pdf(pdf_path)
        results["sections"] = self.split_into_sections(full_text, basename)
        results["page_count"] = len(pages)

        # 3) Embedding-ready chunks.
        chunks = self.process_document_for_vector_store(pdf_path, metadata)
        results["chunk_count"] = len(chunks)
        results["chunks"] = chunks
    except Exception as e:
        # Report failures in-band so callers always get a uniform result shape.
        results["status"] = "error"
        results["error"] = str(e)
    return results