cryogenic22 commited on
Commit
46eb9e8
·
verified ·
1 Parent(s): 6ed286a

Create pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +350 -0
pdf_processor.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF processing utilities for extracting text, sections, and structured data from clinical documents.
3
+ """
4
+
5
+ import os
6
+ import re
7
+ import fitz # PyMuPDF
8
+ from typing import Dict, List, Tuple, Optional, Any
9
+ import json
10
+ from collections import defaultdict
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+
13
class PDFProcessor:
    """Main class for PDF processing, extraction, and chunking.

    Handles clinical-trial documents (Protocols, Statistical Analysis Plans,
    Clinical Study Reports, Investigator Brochures): text extraction via
    PyMuPDF, heuristic section splitting, basic metadata extraction, and
    chunking for storage in a vector store.
    """

    def __init__(self, upload_dir="./data/uploads"):
        """Initialize with the directory for uploaded PDFs (created if missing)."""
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)

    def save_uploaded_file(self, uploaded_file) -> str:
        """Save an uploaded file to disk and return the path.

        Args:
            uploaded_file: File-like object exposing ``.name`` and
                ``.getbuffer()`` (e.g. a Streamlit ``UploadedFile``).

        Returns:
            Filesystem path the file was written to.
        """
        file_path = os.path.join(self.upload_dir, uploaded_file.name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        return file_path

    def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]:
        """
        Extract text from a PDF, page by page.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Tuple containing:
              - Full document text (pages separated by blank lines).
              - List of dicts {"page_num": 1-based page number, "text": page text}.
            On any extraction failure the error is logged and ("", []) is
            returned so callers can continue best-effort.
        """
        try:
            parts: List[str] = []
            pages: List[Dict] = []
            # Context manager guarantees the document handle is closed even if
            # a page fails to render (previously it leaked on mid-loop errors).
            with fitz.open(pdf_path) as doc:
                for page_index, page in enumerate(doc):
                    text = page.get_text()
                    parts.append(text + "\n\n")
                    pages.append({
                        "page_num": page_index + 1,
                        "text": text
                    })
            # join() instead of repeated += avoids quadratic string building.
            return "".join(parts), pages
        except Exception as e:
            print(f"Error extracting text from PDF {pdf_path}: {e}")
            return "", []

    def identify_section_titles(self, text: str) -> List[Dict]:
        """
        Identify potential section titles based on common patterns in clinical documents.

        Args:
            text: Full document text.

        Returns:
            List of dicts with keys "section_num" (str or None for unnumbered
            headers), "section_title", "line_num" (0-based line index), and
            "text" (the stripped matching line).
        """
        # Compiled once per call; ordered by specificity — first match wins.
        patterns = [
            # Numbered sections like "1 INTRODUCTION" or "2.3 Statistical Analysis"
            re.compile(r'^(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z\s]+)$'),
            # ALL CAPS headers like "OBJECTIVES AND ENDPOINTS"
            re.compile(r'^([A-Z][A-Z\s]{3,})$'),
            # Title Case headers with optional trailing colon
            re.compile(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$'),
        ]

        sections: List[Dict] = []
        for line_num, raw_line in enumerate(text.split('\n')):
            line = raw_line.strip()
            if not line:
                continue

            for pattern in patterns:
                match = pattern.match(line)
                if not match:
                    continue
                groups = match.groups()
                if len(groups) > 1:
                    # Numbered pattern: (number, title)
                    section_num, section_title = groups
                else:
                    # Unnumbered pattern: title only
                    section_num, section_title = None, groups[0]
                sections.append({
                    "section_num": section_num,
                    "section_title": section_title.strip(),
                    "line_num": line_num,
                    "text": line
                })
                break  # first matching pattern wins for this line

        return sections

    def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]:
        """
        Split the full text into logical sections based on identified section titles.

        Args:
            full_text: Full document text.
            filename: Document filename (kept for interface compatibility;
                not currently used in the splitting logic).

        Returns:
            Dictionary mapping section names to their text content. Text
            before the first header is kept under the key "preamble";
            repeated titles get " (2)", " (3)", ... suffixes instead of
            overwriting each other.
        """
        lines = full_text.split('\n')
        section_markers = self.identify_section_titles(full_text)

        if not section_markers:
            # No headers found — treat the whole document as one section.
            return {"document": full_text}

        # Sort markers by position in the document.
        section_markers.sort(key=lambda m: m["line_num"])

        sections: Dict[str, str] = {}

        # Fix: text before the first header was previously discarded silently.
        first_line = section_markers[0]["line_num"]
        if first_line > 0:
            preamble = '\n'.join(lines[:first_line])
            if preamble.strip():
                sections["preamble"] = preamble

        for i, marker in enumerate(section_markers):
            start_line = marker["line_num"]
            # Section runs until the next header, or the end of the document.
            if i < len(section_markers) - 1:
                end_line = section_markers[i + 1]["line_num"]
            else:
                end_line = len(lines)

            # Fix: duplicate titles previously overwrote earlier sections.
            base_name = marker["section_title"]
            key = base_name
            suffix = 2
            while key in sections:
                key = f"{base_name} ({suffix})"
                suffix += 1

            sections[key] = '\n'.join(lines[start_line:end_line])

        return sections

    def chunk_text(self, text: str, metadata: Dict[str, Any],
                   chunk_size: int = 1000, overlap: int = 200) -> List[Dict]:
        """
        Split text into chunks suitable for embedding.

        Args:
            text: Text to chunk.
            metadata: Metadata dict attached to every resulting chunk.
            chunk_size: Maximum size of each chunk (characters).
            overlap: Overlap between consecutive chunks (characters).

        Returns:
            List of dicts with "page_content" and "metadata" keys.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=len,
        )

        chunks = text_splitter.create_documents(
            [text],
            metadatas=[metadata]
        )

        return [{"page_content": chunk.page_content, "metadata": chunk.metadata}
                for chunk in chunks]

    def process_document_for_vector_store(self, pdf_path: str,
                                          document_metadata: Dict[str, Any]) -> List[Dict]:
        """
        Process a document for storage in the vector store.

        Extracts text, splits it into sections, chunks each section, and
        attaches section/source metadata to every chunk.

        Args:
            pdf_path: Path to the PDF file.
            document_metadata: Base metadata about the document; copied per
                section so the caller's dict is never mutated.

        Returns:
            List of dicts with "page_content" and "metadata", ready for the
            vector store.
        """
        full_text, pages = self.extract_text_from_pdf(pdf_path)
        sections = self.split_into_sections(full_text, os.path.basename(pdf_path))

        all_chunks: List[Dict] = []

        # Chunk each section independently so section names travel with chunks.
        for section_name, section_text in sections.items():
            section_metadata = document_metadata.copy()
            section_metadata.update({
                "section": section_name,
                "source": os.path.basename(pdf_path)
            })

            chunks = self.chunk_text(section_text, section_metadata)
            all_chunks.extend(chunks)

        return all_chunks

    def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Attempt to extract tables from the PDF.

        This is a placeholder: reliable PDF table extraction is complex and
        typically needs specialized tooling (e.g. Camelot, Tabula, or
        commercial APIs), or OCR for scanned documents.

        Returns:
            Empty list until a real implementation is provided.
        """
        return []  # Placeholder for actual table extraction

    def identify_document_type(self, text: str, filename: str) -> str:
        """
        Attempt to identify the type of document (Protocol, SAP, etc.)
        based on filename patterns first, then content phrases.

        Args:
            text: Document text sample.
            filename: Document filename.

        Returns:
            One of "Protocol", "Statistical Analysis Plan",
            "Clinical Study Report", "Investigator Brochure", or "Unknown".
        """
        lower_text = text.lower()
        lower_filename = filename.lower()

        def has_token(token: str) -> bool:
            # Letters-only boundaries so short tokens ("ib", "sap", "csr")
            # do not fire inside unrelated words ("library", "sapphire") but
            # still match between underscores/digits, e.g. "study_sap_v2.pdf".
            return re.search(rf'(?<![a-z]){token}(?![a-z])', lower_filename) is not None

        # Filename heuristics first: cheap and usually reliable.
        if "protocol" in lower_filename or "prot" in lower_filename:
            return "Protocol"
        elif has_token("sap") or "analysis plan" in lower_filename:
            return "Statistical Analysis Plan"
        elif has_token("csr") or "study report" in lower_filename:
            return "Clinical Study Report"
        elif has_token("ib") or ("investigator" in lower_filename and "brochure" in lower_filename):
            return "Investigator Brochure"

        # Fall back to content phrases.
        if "statistical analysis plan" in lower_text:
            return "Statistical Analysis Plan"
        elif "clinical study protocol" in lower_text or "study protocol" in lower_text:
            return "Protocol"
        elif "clinical study report" in lower_text:
            return "Clinical Study Report"
        elif "investigator's brochure" in lower_text or "investigator brochure" in lower_text:
            return "Investigator Brochure"

        # Default when nothing matched.
        return "Unknown"

    def extract_protocol_id(self, text: str, filename: str) -> Optional[str]:
        """
        Attempt to extract the protocol ID from the document text or filename.

        Args:
            text: Document text (only the first 1000 characters are searched,
                since the ID usually appears on the title page).
            filename: Document filename, searched as a fallback.

        Returns:
            Protocol ID string if found, None otherwise.
        """
        # Common patterns for protocol IDs, most specific first.
        patterns = [
            # Common format like: Protocol B9531002
            r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)',
            # Format with hyphen like: C5161-001
            r'([A-Z][0-9]{4,}-[0-9]{3})',
            # Standard pattern like: ABC-123-456
            r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})',
            # Simple alphanumeric like: XYZ12345
            r'([A-Z]{2,5}[0-9]{4,6})'
        ]

        # The ID is usually near the start (title page), so sample it.
        sample_text = text[:1000]

        for pattern in patterns:
            match = re.search(pattern, sample_text)
            if match:
                return match.group(1)

        # Fall back to the filename.
        for pattern in patterns:
            match = re.search(pattern, filename)
            if match:
                return match.group(1)

        return None

    def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract basic metadata from a PDF without detailed structure extraction.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with keys: document_id, filename, protocol_id, type, title, path.
        """
        filename = os.path.basename(pdf_path)
        full_text, _ = self.extract_text_from_pdf(pdf_path)

        # Sample the first part of the document; title-page info lives here.
        sample_text = full_text[:5000]

        protocol_id = self.extract_protocol_id(sample_text, filename)
        doc_type = self.identify_document_type(sample_text, filename)

        # Heuristic title: first line of plausible title length (20-200 chars).
        lines = sample_text.split('\n')
        title = next(
            (line.strip() for line in lines
             if 20 < len(line.strip()) < 200),
            "Unknown Title"
        )

        return {
            "document_id": os.path.splitext(filename)[0],
            "filename": filename,
            "protocol_id": protocol_id,
            "type": doc_type,
            "title": title,
            "path": pdf_path
        }

    def process_complete_document(self, pdf_path: str) -> Dict[str, Any]:
        """
        Process a complete document for both structured data and vector storage.
        This is the main entry point for document processing.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with "status" ("success"/"error"), "pdf_path", "filename",
            and on success: "metadata", "sections", "page_count",
            "chunk_count", "chunks"; on failure: "error" message.
        """
        results: Dict[str, Any] = {
            "status": "success",
            "pdf_path": pdf_path,
            "filename": os.path.basename(pdf_path)
        }

        try:
            # Step 1: Extract basic metadata.
            metadata = self.extract_basic_metadata(pdf_path)
            results["metadata"] = metadata

            # Step 2: Extract full text and split into sections.
            # NOTE(review): the PDF text is parsed again here (and a third
            # time inside process_document_for_vector_store); acceptable for
            # small documents, but worth caching if throughput matters.
            full_text, pages = self.extract_text_from_pdf(pdf_path)
            sections = self.split_into_sections(full_text, os.path.basename(pdf_path))
            results["sections"] = list(sections.keys())
            results["page_count"] = len(pages)

            # Step 3: Prepare chunks for the vector store.
            chunks = self.process_document_for_vector_store(pdf_path, metadata)
            results["chunk_count"] = len(chunks)
            results["chunks"] = chunks

            return results
        except Exception as e:
            # Best-effort: report the failure instead of raising to the caller.
            results["status"] = "error"
            results["error"] = str(e)
            return results