Spaces:
Runtime error
Runtime error
File size: 13,240 Bytes
46eb9e8 88bb2e2 46eb9e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
"""
PDF processing utilities for extracting text, sections, and structured data from clinical documents.
"""
import os
import re
import fitz # PyMuPDF
from typing import Dict, List, Tuple, Optional, Any
import json
from collections import defaultdict
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Extract text, sections, and structured metadata from clinical PDF documents.

    Pipeline: save an uploaded file -> extract text with PyMuPDF -> detect
    section headings -> split into sections -> chunk each section for a
    vector store. Also provides lightweight heuristics for document type
    (Protocol / SAP / CSR / IB) and protocol-ID extraction.
    """

    # Heading patterns common in protocols and SAPs, compiled once at class
    # creation instead of re-parsing the regexes for every scanned line.
    _SECTION_PATTERNS = [
        # Numbered headings such as "2.3 Statistical Analysis".
        re.compile(r'^(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z\s]+)$'),
        # ALL-CAPS headings such as "OBJECTIVES AND ENDPOINTS".
        re.compile(r'^([A-Z][A-Z\s]{3,})$'),
        # Title Case headings with an optional trailing colon.
        re.compile(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$'),
    ]

    # Candidate protocol-ID formats, tried in order (most specific first).
    _PROTOCOL_ID_PATTERNS = [
        re.compile(r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)'),  # "Protocol B9531002"
        re.compile(r'([A-Z][0-9]{4,}-[0-9]{3})'),                # "C5161-001"
        re.compile(r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})'),       # "ABC-123-456"
        re.compile(r'([A-Z]{2,5}[0-9]{4,6})'),                   # "XYZ12345"
    ]

    # "ib" only as a standalone token: a bare substring test would classify
    # e.g. "feasibility.pdf" or "library.pdf" as an Investigator Brochure.
    _IB_FILENAME_RE = re.compile(r'(?<![a-z0-9])ib(?![a-z0-9])')

    def __init__(self, upload_dir="./data/uploads"):
        """Initialize with the directory for uploaded PDFs (created if missing)."""
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)

    def save_uploaded_file(self, uploaded_file) -> str:
        """Save an uploaded file to disk and return its path.

        Args:
            uploaded_file: File-like upload object exposing ``.name`` and
                ``.getbuffer()`` (e.g. a Streamlit ``UploadedFile``).

        Returns:
            Absolute or relative path of the written file inside ``upload_dir``.
        """
        # Reduce the client-supplied name to its basename so a crafted name
        # like "../../etc/x" cannot escape the upload directory.
        safe_name = os.path.basename(uploaded_file.name)
        file_path = os.path.join(self.upload_dir, safe_name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        return file_path

    def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]:
        """Extract text from a PDF, page by page.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Tuple of:
                - Full text string (pages joined with blank lines)
                - List of dicts with 1-based ``page_num`` and ``text``
            On any extraction failure, returns ``("", [])`` (best-effort by
            design — callers treat an unreadable PDF as empty).
        """
        try:
            page_texts: List[str] = []
            pages: List[Dict] = []
            # Context manager guarantees the document handle is closed even
            # if get_text() raises mid-way (the original leaked it on error).
            with fitz.open(pdf_path) as doc:
                for page_index, page in enumerate(doc):
                    text = page.get_text()
                    page_texts.append(text)
                    pages.append({
                        "page_num": page_index + 1,
                        "text": text,
                    })
            # join is O(total length); repeated += would be quadratic.
            full_text = "".join(t + "\n\n" for t in page_texts)
            return full_text, pages
        except Exception as e:
            # Deliberate broad catch: a corrupt upload must not crash the app.
            print(f"Error extracting text from PDF {pdf_path}: {e}")
            return "", []

    def identify_section_titles(self, text: str) -> List[Dict]:
        """Identify potential section headings via common clinical-doc patterns.

        Args:
            text: Full document text.

        Returns:
            List of dicts with ``section_num`` (None for unnumbered headings),
            ``section_title``, ``line_num`` (0-based line index into ``text``
            split on newlines), and the raw heading ``text``.
        """
        sections = []
        for line_num, raw_line in enumerate(text.split('\n')):
            line = raw_line.strip()
            if not line:
                continue
            for pattern in self._SECTION_PATTERNS:
                match = pattern.match(line)
                if not match:
                    continue
                if len(match.groups()) > 1:
                    # Numbered heading: (number, title).
                    section_num, section_title = match.groups()
                else:
                    # Unnumbered heading: single title group.
                    section_num, section_title = None, match.group(1)
                sections.append({
                    "section_num": section_num,
                    "section_title": section_title.strip(),
                    "line_num": line_num,
                    "text": line,
                })
                break  # first matching pattern wins for this line
        return sections

    def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]:
        """Split the full text into sections keyed by detected heading titles.

        Args:
            full_text: Complete document text.
            filename: Source filename (unused; kept for interface
                compatibility with existing callers).

        Returns:
            Dict mapping section title -> section text. Falls back to
            ``{"document": full_text}`` when no headings are found.
        """
        lines = full_text.split('\n')
        markers = self.identify_section_titles(full_text)
        if not markers:
            return {"document": full_text}

        markers.sort(key=lambda m: m["line_num"])
        sections: Dict[str, str] = {}

        # BUG FIX: preserve front matter before the first heading — the
        # original silently dropped everything above the first marker.
        first_line = markers[0]["line_num"]
        if first_line > 0:
            preamble = '\n'.join(lines[:first_line])
            if preamble.strip():
                sections["preamble"] = preamble

        for i, marker in enumerate(markers):
            start = marker["line_num"]
            # Section runs until the next heading, or end of document.
            end = markers[i + 1]["line_num"] if i + 1 < len(markers) else len(lines)
            body = '\n'.join(lines[start:end])
            title = marker["section_title"]
            # BUG FIX: merge repeated titles — dict assignment used to
            # overwrite (and lose) the earlier occurrence's text.
            if title in sections:
                sections[title] += '\n' + body
            else:
                sections[title] = body
        return sections

    def chunk_text(self, text: str, metadata: Dict[str, Any],
                   chunk_size: int = 1000, overlap: int = 200) -> List[Dict]:
        """Split text into overlapping chunks suitable for embedding.

        Args:
            text: Text to chunk.
            metadata: Metadata attached to every chunk.
            chunk_size: Maximum characters per chunk.
            overlap: Character overlap between consecutive chunks.

        Returns:
            List of ``{"page_content": str, "metadata": dict}`` dicts.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=len,
        )
        chunks = text_splitter.create_documents([text], metadatas=[metadata])
        return [
            {"page_content": chunk.page_content, "metadata": chunk.metadata}
            for chunk in chunks
        ]

    def process_document_for_vector_store(self, pdf_path: str,
                                          document_metadata: Dict[str, Any]) -> List[Dict]:
        """Extract, section, and chunk a document for vector-store ingestion.

        Args:
            pdf_path: Path to the PDF file.
            document_metadata: Base metadata copied into every chunk.

        Returns:
            List of chunk dicts ready for the vector store, each tagged with
            its ``section`` title and ``source`` filename.
        """
        full_text, _pages = self.extract_text_from_pdf(pdf_path)
        source_name = os.path.basename(pdf_path)
        sections = self.split_into_sections(full_text, source_name)

        all_chunks: List[Dict] = []
        for section_name, section_text in sections.items():
            # Copy so per-section keys never leak into the caller's dict.
            section_metadata = document_metadata.copy()
            section_metadata.update({
                "section": section_name,
                "source": source_name,
            })
            all_chunks.extend(self.chunk_text(section_text, section_metadata))
        return all_chunks

    def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Attempt to extract tables from the PDF (not yet implemented).

        Table extraction from PDFs is complex and usually needs specialized
        tooling (Camelot, Tabula, OCR, or commercial APIs).

        Returns:
            Empty list — placeholder until a real implementation exists.
        """
        return []

    def identify_document_type(self, text: str, filename: str) -> str:
        """Classify the document type from filename patterns, then content.

        Args:
            text: Sample of the document text.
            filename: Source filename.

        Returns:
            One of "Protocol", "Statistical Analysis Plan",
            "Clinical Study Report", "Investigator Brochure", or "Unknown".
        """
        lower_text = text.lower()
        lower_filename = filename.lower()

        # Filename hints take priority over content.
        # NOTE(review): "prot" is a broad substring (matches e.g. "protein");
        # kept for backward compatibility — confirm before tightening.
        if "protocol" in lower_filename or "prot" in lower_filename:
            return "Protocol"
        if "sap" in lower_filename or "analysis plan" in lower_filename:
            return "Statistical Analysis Plan"
        if "csr" in lower_filename or "study report" in lower_filename:
            return "Clinical Study Report"
        # BUG FIX: the original relied on `or`/`and` precedence with a bare
        # `"ib" in filename` substring test, so any filename containing the
        # letters "ib" (e.g. "feasibility.pdf") was classified as an IB.
        if self._IB_FILENAME_RE.search(lower_filename) or (
            "investigator" in lower_filename and "brochure" in lower_filename
        ):
            return "Investigator Brochure"

        # Fall back to characteristic phrases in the text.
        if "statistical analysis plan" in lower_text:
            return "Statistical Analysis Plan"
        if "clinical study protocol" in lower_text or "study protocol" in lower_text:
            return "Protocol"
        if "clinical study report" in lower_text:
            return "Clinical Study Report"
        if "investigator's brochure" in lower_text or "investigator brochure" in lower_text:
            return "Investigator Brochure"

        return "Unknown"

    def extract_protocol_id(self, text: str, filename: str) -> Optional[str]:
        """Extract a protocol ID from the document text or filename.

        Args:
            text: Document text (only the first 1000 chars are scanned —
                protocol IDs typically appear on the title page).
            filename: Source filename, checked as a fallback.

        Returns:
            The first matching protocol ID, or None if nothing matches.
        """
        sample_text = text[:1000]
        for candidate in (sample_text, filename):
            for pattern in self._PROTOCOL_ID_PATTERNS:
                match = pattern.search(candidate)
                if match:
                    return match.group(1)
        return None

    def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Extract basic document metadata without detailed structure parsing.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with ``document_id``, ``filename``, ``protocol_id``,
            ``type``, ``title``, and ``path``.
        """
        filename = os.path.basename(pdf_path)
        full_text, _ = self.extract_text_from_pdf(pdf_path)

        # Only the head of the document — titles/IDs live on the first pages.
        sample_text = full_text[:5000]

        protocol_id = self.extract_protocol_id(sample_text, filename)
        doc_type = self.identify_document_type(sample_text, filename)

        # Heuristic title: first line of plausible title length (21-199 chars).
        title = "Unknown Title"
        for line in sample_text.split('\n'):
            stripped = line.strip()
            if 20 < len(stripped) < 200:
                title = stripped
                break

        return {
            "document_id": os.path.splitext(filename)[0],
            "filename": filename,
            "protocol_id": protocol_id,
            "type": doc_type,
            "title": title,
            "path": pdf_path,
        }

    def process_complete_document(self, pdf_path: str) -> Dict[str, Any]:
        """Process a document end-to-end: metadata, sections, and chunks.

        Main entry point for document processing.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with ``status`` ("success"/"error"), ``metadata``,
            ``sections``, ``page_count``, ``chunk_count``, and ``chunks``;
            on failure, ``status`` is "error" and ``error`` holds the message.
        """
        results: Dict[str, Any] = {
            "status": "success",
            "pdf_path": pdf_path,
            "filename": os.path.basename(pdf_path),
        }
        try:
            # Step 1: basic metadata (type, protocol ID, title).
            metadata = self.extract_basic_metadata(pdf_path)
            results["metadata"] = metadata

            # Step 2: full text and section split.
            # NOTE(review): the PDF is re-parsed here and again in step 3;
            # acceptable for moderate files — TODO: thread full_text through.
            full_text, pages = self.extract_text_from_pdf(pdf_path)
            sections = self.split_into_sections(full_text, os.path.basename(pdf_path))
            results["sections"] = sections
            results["page_count"] = len(pages)

            # Step 3: chunks for the vector store.
            chunks = self.process_document_for_vector_store(pdf_path, metadata)
            results["chunk_count"] = len(chunks)
            results["chunks"] = chunks
            return results
        except Exception as e:
            results["status"] = "error"
            results["error"] = str(e)
            return results