| """ | |
| PDF processing utilities for extracting text, sections, and structured data from clinical documents. | |
| """ | |
import json
import os
import re
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Main class for PDF processing, extraction, and chunking."""

    def __init__(self, upload_dir="./data/uploads"):
        """Initialize with the directory for uploaded PDFs.

        Args:
            upload_dir: Directory where uploaded PDFs are stored; created
                on demand if it does not exist yet.
        """
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)
| def save_uploaded_file(self, uploaded_file) -> str: | |
| """Save an uploaded file to disk and return the path.""" | |
| file_path = os.path.join(self.upload_dir, uploaded_file.name) | |
| with open(file_path, "wb") as f: | |
| f.write(uploaded_file.getbuffer()) | |
| return file_path | |
| def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]: | |
| """ | |
| Extract text from PDF with page numbers and attempt to identify section headers. | |
| Returns: | |
| Tuple containing: | |
| - Full text string | |
| - List of pages with text and page numbers | |
| """ | |
| try: | |
| doc = fitz.open(pdf_path) | |
| full_text = "" | |
| pages = [] | |
| for page_num, page in enumerate(doc): | |
| text = page.get_text() | |
| full_text += text + "\n\n" | |
| pages.append({ | |
| "page_num": page_num + 1, | |
| "text": text | |
| }) | |
| doc.close() | |
| return full_text, pages | |
| except Exception as e: | |
| print(f"Error extracting text from PDF {pdf_path}: {e}") | |
| return "", [] | |
| def identify_section_titles(self, text: str) -> List[Dict]: | |
| """ | |
| Identify potential section titles based on common patterns in clinical documents. | |
| Returns: | |
| List of dictionaries with section title and position info | |
| """ | |
| # Common patterns for section headers in protocols and SAPs | |
| patterns = [ | |
| # Numbered sections like "1. INTRODUCTION" or "2.3 Statistical Analysis" | |
| r'^(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z\s]+)$', | |
| # ALL CAPS headers like "OBJECTIVES AND ENDPOINTS" | |
| r'^([A-Z][A-Z\s]{3,})$', | |
| # Title case headers with optional trailing colon | |
| r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$' | |
| ] | |
| sections = [] | |
| for line_num, line in enumerate(text.split('\n')): | |
| line = line.strip() | |
| if not line: | |
| continue | |
| for pattern in patterns: | |
| matches = re.match(pattern, line) | |
| if matches: | |
| if len(matches.groups()) > 1: | |
| # For numbered patterns | |
| section_num, section_title = matches.groups() | |
| sections.append({ | |
| "section_num": section_num, | |
| "section_title": section_title.strip(), | |
| "line_num": line_num, | |
| "text": line | |
| }) | |
| else: | |
| # For unnumbered patterns | |
| section_title = matches.group(1) | |
| sections.append({ | |
| "section_num": None, | |
| "section_title": section_title.strip(), | |
| "line_num": line_num, | |
| "text": line | |
| }) | |
| break | |
| return sections | |
| def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]: | |
| """ | |
| Split the full text into logical sections based on identified section titles. | |
| Returns: | |
| Dictionary mapping section names to their text content | |
| """ | |
| # First identify potential section titles | |
| lines = full_text.split('\n') | |
| section_markers = self.identify_section_titles(full_text) | |
| if not section_markers: | |
| # If no sections found, treat the whole document as one section | |
| return {"document": full_text} | |
| # Sort section markers by line number | |
| section_markers.sort(key=lambda x: x["line_num"]) | |
| # Create sections | |
| sections = {} | |
| for i in range(len(section_markers)): | |
| start_line = section_markers[i]["line_num"] | |
| section_name = section_markers[i]["section_title"] | |
| # Determine end line (next section or end of document) | |
| if i < len(section_markers) - 1: | |
| end_line = section_markers[i+1]["line_num"] | |
| else: | |
| end_line = len(lines) | |
| # Extract section text | |
| section_text = '\n'.join(lines[start_line:end_line]) | |
| sections[section_name] = section_text | |
| return sections | |
| def chunk_text(self, text: str, metadata: Dict[str, Any], | |
| chunk_size: int = 1000, overlap: int = 200) -> List[Dict]: | |
| """ | |
| Split text into chunks suitable for embedding. | |
| Args: | |
| text: Text to chunk | |
| metadata: Metadata to include with each chunk | |
| chunk_size: Maximum size of each chunk | |
| overlap: Overlap between chunks | |
| Returns: | |
| List of dictionaries with page_content and metadata | |
| """ | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=overlap, | |
| length_function=len, | |
| ) | |
| chunks = text_splitter.create_documents( | |
| [text], | |
| metadatas=[metadata] | |
| ) | |
| return [{"page_content": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks] | |
| def process_document_for_vector_store(self, pdf_path: str, | |
| document_metadata: Dict[str, Any]) -> List[Dict]: | |
| """ | |
| Process a document for storage in the vector store. | |
| Extract text, split into chunks, and add metadata. | |
| Args: | |
| pdf_path: Path to the PDF file | |
| document_metadata: Metadata about the document | |
| Returns: | |
| List of dictionaries with page_content and metadata ready for vector store | |
| """ | |
| full_text, pages = self.extract_text_from_pdf(pdf_path) | |
| sections = self.split_into_sections(full_text, os.path.basename(pdf_path)) | |
| all_chunks = [] | |
| # Process each section as its own set of chunks | |
| for section_name, section_text in sections.items(): | |
| section_metadata = document_metadata.copy() | |
| section_metadata.update({ | |
| "section": section_name, | |
| "source": os.path.basename(pdf_path) | |
| }) | |
| chunks = self.chunk_text(section_text, section_metadata) | |
| all_chunks.extend(chunks) | |
| return all_chunks | |
| def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]: | |
| """ | |
| Attempt to extract tables from the PDF. | |
| This is a simplified implementation and may not work for all PDFs. | |
| Returns: | |
| List of dictionaries with table info including page number and content | |
| """ | |
| # This is a placeholder. Table extraction from PDFs is complex and often | |
| # requires specialized libraries or even manual extraction/OCR | |
| # For a production system, consider tools like Camelot, Tabula, or commercial APIs | |
| return [] # Placeholder for actual table extraction | |
| def identify_document_type(self, text: str, filename: str) -> str: | |
| """ | |
| Attempt to identify the type of document (Protocol, SAP, etc.) | |
| based on content and filename patterns. | |
| Returns: | |
| String indicating document type | |
| """ | |
| lower_text = text.lower() | |
| lower_filename = filename.lower() | |
| # Check filename patterns | |
| if "protocol" in lower_filename or "prot" in lower_filename: | |
| return "Protocol" | |
| elif "sap" in lower_filename or "analysis plan" in lower_filename: | |
| return "Statistical Analysis Plan" | |
| elif "csr" in lower_filename or "study report" in lower_filename: | |
| return "Clinical Study Report" | |
| elif "ib" in lower_filename or "investigator" in lower_filename and "brochure" in lower_filename: | |
| return "Investigator Brochure" | |
| # Check content patterns | |
| if "statistical analysis plan" in lower_text: | |
| return "Statistical Analysis Plan" | |
| elif "clinical study protocol" in lower_text or "study protocol" in lower_text: | |
| return "Protocol" | |
| elif "clinical study report" in lower_text: | |
| return "Clinical Study Report" | |
| elif "investigator's brochure" in lower_text or "investigator brochure" in lower_text: | |
| return "Investigator Brochure" | |
| # Default | |
| return "Unknown" | |
| def extract_protocol_id(self, text: str, filename: str) -> Optional[str]: | |
| """ | |
| Attempt to extract the protocol ID from the document text or filename. | |
| Returns: | |
| Protocol ID string if found, None otherwise | |
| """ | |
| # Common patterns for protocol IDs | |
| patterns = [ | |
| # Common format like: Protocol B9531002 | |
| r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)', | |
| # Format with hyphen like: C5161-001 | |
| r'([A-Z][0-9]{4,}-[0-9]{3})', | |
| # Standard pattern like: ABC-123-456 | |
| r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})', | |
| # Simple alphanumeric like: XYZ12345 | |
| r'([A-Z]{2,5}[0-9]{4,6})' | |
| ] | |
| # Try to find in the first few hundred characters (often in the title) | |
| sample_text = text[:1000] | |
| for pattern in patterns: | |
| matches = re.search(pattern, sample_text) | |
| if matches: | |
| return matches.group(1) | |
| # Check filename | |
| for pattern in patterns: | |
| matches = re.search(pattern, filename) | |
| if matches: | |
| return matches.group(1) | |
| return None | |
| def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]: | |
| """ | |
| Extract basic metadata from a PDF without detailed structure extraction. | |
| Returns: | |
| Dictionary with basic document metadata | |
| """ | |
| filename = os.path.basename(pdf_path) | |
| full_text, _ = self.extract_text_from_pdf(pdf_path) | |
| # Sample the first part of the document | |
| sample_text = full_text[:5000] | |
| # Extract potential protocol ID | |
| protocol_id = self.extract_protocol_id(sample_text, filename) | |
| # Determine document type | |
| doc_type = self.identify_document_type(sample_text, filename) | |
| # Extract title (usually in the first few lines) | |
| lines = sample_text.split('\n') | |
| title = next((line.strip() for line in lines if len(line.strip()) > 20 and len(line.strip()) < 200), "Unknown Title") | |
| # Create basic metadata | |
| metadata = { | |
| "document_id": os.path.splitext(filename)[0], | |
| "filename": filename, | |
| "protocol_id": protocol_id, | |
| "type": doc_type, | |
| "title": title, | |
| "path": pdf_path | |
| } | |
| return metadata | |
| def process_complete_document(self, pdf_path: str) -> Dict[str, Any]: | |
| """ | |
| Process a complete document for both structured data and vector storage. | |
| This is the main entry point for document processing. | |
| Returns: | |
| Dictionary with processing results | |
| """ | |
| results = { | |
| "status": "success", | |
| "pdf_path": pdf_path, | |
| "filename": os.path.basename(pdf_path) | |
| } | |
| try: | |
| # Step 1: Extract basic metadata | |
| metadata = self.extract_basic_metadata(pdf_path) | |
| results["metadata"] = metadata | |
| # Step 2: Extract full text and split into sections | |
| full_text, pages = self.extract_text_from_pdf(pdf_path) | |
| sections = self.split_into_sections(full_text, os.path.basename(pdf_path)) | |
| results["sections"] = sections # Store the entire sections dictionary | |
| results["page_count"] = len(pages) | |
| # Step 3: Prepare chunks for vector store | |
| chunks = self.process_document_for_vector_store(pdf_path, metadata) | |
| results["chunk_count"] = len(chunks) | |
| results["chunks"] = chunks | |
| return results | |
| except Exception as e: | |
| results["status"] = "error" | |
| results["error"] = str(e) | |
| return results |