""" PDF processing utilities for extracting text, sections, and structured data from clinical documents. """ import os import re import fitz # PyMuPDF from typing import Dict, List, Tuple, Optional, Any import json from collections import defaultdict from langchain.text_splitter import RecursiveCharacterTextSplitter class PDFProcessor: """Main class for PDF processing, extraction, and chunking.""" def __init__(self, upload_dir="./data/uploads"): """Initialize with the directory for uploaded PDFs.""" self.upload_dir = upload_dir os.makedirs(upload_dir, exist_ok=True) def save_uploaded_file(self, uploaded_file) -> str: """Save an uploaded file to disk and return the path.""" file_path = os.path.join(self.upload_dir, uploaded_file.name) with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) return file_path def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]: """ Extract text from PDF with page numbers and attempt to identify section headers. Returns: Tuple containing: - Full text string - List of pages with text and page numbers """ try: doc = fitz.open(pdf_path) full_text = "" pages = [] for page_num, page in enumerate(doc): text = page.get_text() full_text += text + "\n\n" pages.append({ "page_num": page_num + 1, "text": text }) doc.close() return full_text, pages except Exception as e: print(f"Error extracting text from PDF {pdf_path}: {e}") return "", [] def identify_section_titles(self, text: str) -> List[Dict]: """ Identify potential section titles based on common patterns in clinical documents. Returns: List of dictionaries with section title and position info """ # Common patterns for section headers in protocols and SAPs patterns = [ # Numbered sections like "1. INTRODUCTION" or "2.3 Statistical Analysis" r'^(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z\s]+)$', # ALL CAPS headers like "OBJECTIVES AND ENDPOINTS" r'^([A-Z][A-Z\s]{3,})$', # Title case headers with optional trailing colon r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$' ] sections = [] for line_num, line in enumerate(text.split('\n')): line = line.strip() if not line: continue for pattern in patterns: matches = re.match(pattern, line) if matches: if len(matches.groups()) > 1: # For numbered patterns section_num, section_title = matches.groups() sections.append({ "section_num": section_num, "section_title": section_title.strip(), "line_num": line_num, "text": line }) else: # For unnumbered patterns section_title = matches.group(1) sections.append({ "section_num": None, "section_title": section_title.strip(), "line_num": line_num, "text": line }) break return sections def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]: """ Split the full text into logical sections based on identified section titles. Returns: Dictionary mapping section names to their text content """ # First identify potential section titles lines = full_text.split('\n') section_markers = self.identify_section_titles(full_text) if not section_markers: # If no sections found, treat the whole document as one section return {"document": full_text} # Sort section markers by line number section_markers.sort(key=lambda x: x["line_num"]) # Create sections sections = {} for i in range(len(section_markers)): start_line = section_markers[i]["line_num"] section_name = section_markers[i]["section_title"] # Determine end line (next section or end of document) if i < len(section_markers) - 1: end_line = section_markers[i+1]["line_num"] else: end_line = len(lines) # Extract section text section_text = '\n'.join(lines[start_line:end_line]) sections[section_name] = section_text return sections def chunk_text(self, text: str, metadata: Dict[str, Any], chunk_size: int = 1000, overlap: int = 200) -> List[Dict]: """ Split text into chunks suitable for embedding. Args: text: Text to chunk metadata: Metadata to include with each chunk chunk_size: Maximum size of each chunk overlap: Overlap between chunks Returns: List of dictionaries with page_content and metadata """ text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=overlap, length_function=len, ) chunks = text_splitter.create_documents( [text], metadatas=[metadata] ) return [{"page_content": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks] def process_document_for_vector_store(self, pdf_path: str, document_metadata: Dict[str, Any]) -> List[Dict]: """ Process a document for storage in the vector store. Extract text, split into chunks, and add metadata. Args: pdf_path: Path to the PDF file document_metadata: Metadata about the document Returns: List of dictionaries with page_content and metadata ready for vector store """ full_text, pages = self.extract_text_from_pdf(pdf_path) sections = self.split_into_sections(full_text, os.path.basename(pdf_path)) all_chunks = [] # Process each section as its own set of chunks for section_name, section_text in sections.items(): section_metadata = document_metadata.copy() section_metadata.update({ "section": section_name, "source": os.path.basename(pdf_path) }) chunks = self.chunk_text(section_text, section_metadata) all_chunks.extend(chunks) return all_chunks def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]: """ Attempt to extract tables from the PDF. This is a simplified implementation and may not work for all PDFs. Returns: List of dictionaries with table info including page number and content """ # This is a placeholder. Table extraction from PDFs is complex and often # requires specialized libraries or even manual extraction/OCR # For a production system, consider tools like Camelot, Tabula, or commercial APIs return [] # Placeholder for actual table extraction def identify_document_type(self, text: str, filename: str) -> str: """ Attempt to identify the type of document (Protocol, SAP, etc.) based on content and filename patterns. Returns: String indicating document type """ lower_text = text.lower() lower_filename = filename.lower() # Check filename patterns if "protocol" in lower_filename or "prot" in lower_filename: return "Protocol" elif "sap" in lower_filename or "analysis plan" in lower_filename: return "Statistical Analysis Plan" elif "csr" in lower_filename or "study report" in lower_filename: return "Clinical Study Report" elif "ib" in lower_filename or "investigator" in lower_filename and "brochure" in lower_filename: return "Investigator Brochure" # Check content patterns if "statistical analysis plan" in lower_text: return "Statistical Analysis Plan" elif "clinical study protocol" in lower_text or "study protocol" in lower_text: return "Protocol" elif "clinical study report" in lower_text: return "Clinical Study Report" elif "investigator's brochure" in lower_text or "investigator brochure" in lower_text: return "Investigator Brochure" # Default return "Unknown" def extract_protocol_id(self, text: str, filename: str) -> Optional[str]: """ Attempt to extract the protocol ID from the document text or filename. Returns: Protocol ID string if found, None otherwise """ # Common patterns for protocol IDs patterns = [ # Common format like: Protocol B9531002 r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)', # Format with hyphen like: C5161-001 r'([A-Z][0-9]{4,}-[0-9]{3})', # Standard pattern like: ABC-123-456 r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})', # Simple alphanumeric like: XYZ12345 r'([A-Z]{2,5}[0-9]{4,6})' ] # Try to find in the first few hundred characters (often in the title) sample_text = text[:1000] for pattern in patterns: matches = re.search(pattern, sample_text) if matches: return matches.group(1) # Check filename for pattern in patterns: matches = re.search(pattern, filename) if matches: return matches.group(1) return None def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]: """ Extract basic metadata from a PDF without detailed structure extraction. Returns: Dictionary with basic document metadata """ filename = os.path.basename(pdf_path) full_text, _ = self.extract_text_from_pdf(pdf_path) # Sample the first part of the document sample_text = full_text[:5000] # Extract potential protocol ID protocol_id = self.extract_protocol_id(sample_text, filename) # Determine document type doc_type = self.identify_document_type(sample_text, filename) # Extract title (usually in the first few lines) lines = sample_text.split('\n') title = next((line.strip() for line in lines if len(line.strip()) > 20 and len(line.strip()) < 200), "Unknown Title") # Create basic metadata metadata = { "document_id": os.path.splitext(filename)[0], "filename": filename, "protocol_id": protocol_id, "type": doc_type, "title": title, "path": pdf_path } return metadata def process_complete_document(self, pdf_path: str) -> Dict[str, Any]: """ Process a complete document for both structured data and vector storage. This is the main entry point for document processing. Returns: Dictionary with processing results """ results = { "status": "success", "pdf_path": pdf_path, "filename": os.path.basename(pdf_path) } try: # Step 1: Extract basic metadata metadata = self.extract_basic_metadata(pdf_path) results["metadata"] = metadata # Step 2: Extract full text and split into sections full_text, pages = self.extract_text_from_pdf(pdf_path) sections = self.split_into_sections(full_text, os.path.basename(pdf_path)) results["sections"] = sections # Store the entire sections dictionary results["page_count"] = len(pages) # Step 3: Prepare chunks for vector store chunks = self.process_document_for_vector_store(pdf_path, metadata) results["chunk_count"] = len(chunks) results["chunks"] = chunks return results except Exception as e: results["status"] = "error" results["error"] = str(e) return results