File size: 13,240 Bytes
46eb9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88bb2e2
46eb9e8
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
"""
PDF processing utilities for extracting text, sections, and structured data from clinical documents.
"""

import os
import re
import fitz  # PyMuPDF
from typing import Dict, List, Tuple, Optional, Any
import json
from collections import defaultdict
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PDFProcessor:
    """Extract text, sections, metadata, and embedding-ready chunks from clinical PDFs.

    Covers the document pipeline for clinical trial artifacts (Protocols,
    Statistical Analysis Plans, Clinical Study Reports, Investigator
    Brochures): saving uploads, text extraction via PyMuPDF, heuristic
    section splitting, document-type / protocol-ID detection, and chunking
    for a vector store via LangChain's RecursiveCharacterTextSplitter.
    """

    def __init__(self, upload_dir="./data/uploads"):
        """Initialize with the directory for uploaded PDFs, creating it if absent."""
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)

    def save_uploaded_file(self, uploaded_file) -> str:
        """Save an uploaded file to disk and return its path.

        Args:
            uploaded_file: File-like upload object exposing ``.name`` and
                ``.getbuffer()`` (e.g. a Streamlit UploadedFile).

        Returns:
            Path of the saved file inside ``self.upload_dir``.

        The client-supplied name is reduced to its basename so a crafted name
        such as "../../etc/passwd" cannot escape the upload directory.
        """
        safe_name = os.path.basename(uploaded_file.name)  # path-traversal guard
        file_path = os.path.join(self.upload_dir, safe_name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        return file_path

    def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]:
        """
        Extract text from a PDF, page by page.

        Returns:
            Tuple containing:
                - Full text string (page texts joined with blank lines)
                - List of dicts with 1-based "page_num" and "text" per page

        On any extraction error the failure is printed and ("", []) is
        returned — deliberate best-effort so one corrupt PDF does not abort
        a batch.
        """
        try:
            # Context manager guarantees the document handle is released even
            # if get_text() raises partway through (the previous version
            # leaked the handle on error).
            with fitz.open(pdf_path) as doc:
                pages = [
                    {"page_num": page_num + 1, "text": page.get_text()}
                    for page_num, page in enumerate(doc)
                ]
            # join() instead of quadratic string += per page; output is
            # byte-identical to concatenating text + "\n\n" per page.
            full_text = "".join(p["text"] + "\n\n" for p in pages)
            return full_text, pages
        except Exception as e:
            print(f"Error extracting text from PDF {pdf_path}: {e}")
            return "", []

    def identify_section_titles(self, text: str) -> List[Dict]:
        """
        Identify potential section titles using common clinical-document patterns.

        Returns:
            List of dicts with "section_num" (None for unnumbered headers),
            "section_title", 0-based "line_num", and the raw stripped "text".
        """
        # Heuristic patterns for section headers in protocols and SAPs,
        # tried in order; first match wins.
        patterns = [
            # Numbered sections like "1. INTRODUCTION" or "2.3 Statistical Analysis".
            # The optional "\.?" accepts the trailing dot in "1. INTRODUCTION",
            # which the previous pattern rejected (the dot is not whitespace).
            r'^(\d+(?:\.\d+)*)\.?\s+([A-Z][A-Za-z\s]+)$',
            # ALL CAPS headers like "OBJECTIVES AND ENDPOINTS"
            r'^([A-Z][A-Z\s]{3,})$',
            # Title case headers with optional trailing colon
            r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$'
        ]

        sections = []
        for line_num, line in enumerate(text.split('\n')):
            line = line.strip()
            if not line:
                continue

            for pattern in patterns:
                match = re.match(pattern, line)
                if not match:
                    continue
                groups = match.groups()
                if len(groups) > 1:
                    # Numbered pattern: (number, title)
                    section_num, section_title = groups
                else:
                    # Unnumbered pattern: title only
                    section_num, section_title = None, groups[0]
                sections.append({
                    "section_num": section_num,
                    "section_title": section_title.strip(),
                    "line_num": line_num,
                    "text": line
                })
                break

        return sections

    def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]:
        """
        Split the full text into logical sections based on detected section titles.

        Duplicate titles (repeated headers are common in clinical documents)
        are disambiguated with a " (2)", " (3)", ... suffix instead of
        silently overwriting earlier sections.

        Args:
            full_text: Complete extracted text of the document.
            filename: Source filename (currently unused; retained for
                interface compatibility with existing callers).

        Returns:
            Mapping of section title -> section text, or
            {"document": full_text} when no headers are detected.
        """
        lines = full_text.split('\n')
        section_markers = self.identify_section_titles(full_text)

        if not section_markers:
            # No recognizable headers: treat the whole document as one section.
            return {"document": full_text}

        # Process headers in document order.
        section_markers.sort(key=lambda m: m["line_num"])

        sections = {}
        for i, marker in enumerate(section_markers):
            start_line = marker["line_num"]
            # Section runs until the next header, or to end of document.
            if i + 1 < len(section_markers):
                end_line = section_markers[i + 1]["line_num"]
            else:
                end_line = len(lines)

            base_name = marker["section_title"]
            section_name = base_name
            suffix = 2
            while section_name in sections:  # avoid clobbering duplicate titles
                section_name = f"{base_name} ({suffix})"
                suffix += 1

            sections[section_name] = '\n'.join(lines[start_line:end_line])

        return sections

    def chunk_text(self, text: str, metadata: Dict[str, Any],
                  chunk_size: int = 1000, overlap: int = 200) -> List[Dict]:
        """
        Split text into chunks suitable for embedding.

        Args:
            text: Text to chunk.
            metadata: Metadata attached to every chunk.
            chunk_size: Maximum size of each chunk (characters).
            overlap: Overlap between consecutive chunks (characters).

        Returns:
            List of dicts with "page_content" and "metadata" keys.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=len,
        )

        chunks = text_splitter.create_documents(
            [text],
            metadatas=[metadata]
        )

        # Flatten LangChain Document objects into plain dicts.
        return [{"page_content": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks]

    def process_document_for_vector_store(self, pdf_path: str,
                                         document_metadata: Dict[str, Any]) -> List[Dict]:
        """
        Process a document for storage in the vector store:
        extract text, split into sections, chunk each section with metadata.

        Args:
            pdf_path: Path to the PDF file.
            document_metadata: Base metadata about the document; a copy is
                extended with "section" and "source" per chunk.

        Returns:
            List of dicts with "page_content" and "metadata", vector-store ready.
        """
        full_text, pages = self.extract_text_from_pdf(pdf_path)
        sections = self.split_into_sections(full_text, os.path.basename(pdf_path))

        all_chunks = []

        # Chunk each section independently so section names survive as metadata.
        for section_name, section_text in sections.items():
            section_metadata = document_metadata.copy()  # don't mutate caller's dict
            section_metadata.update({
                "section": section_name,
                "source": os.path.basename(pdf_path)
            })

            chunks = self.chunk_text(section_text, section_metadata)
            all_chunks.extend(chunks)

        return all_chunks

    def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Attempt to extract tables from the PDF.

        Currently a placeholder returning an empty list: table extraction from
        PDFs is complex and typically needs specialized tooling (e.g. Camelot,
        Tabula, or commercial APIs), or OCR for scanned documents.

        Returns:
            List of dicts with table info (page number, content) — empty for now.
        """
        return []  # Placeholder for actual table extraction

    def identify_document_type(self, text: str, filename: str) -> str:
        """
        Identify the type of document (Protocol, SAP, etc.) from filename
        patterns first, then content patterns.

        Returns:
            One of "Protocol", "Statistical Analysis Plan",
            "Clinical Study Report", "Investigator Brochure", or "Unknown".
        """
        lower_text = text.lower()
        lower_filename = filename.lower()

        # Filename checks take priority over content checks.
        if "protocol" in lower_filename or "prot" in lower_filename:
            return "Protocol"
        elif "sap" in lower_filename or "analysis plan" in lower_filename:
            return "Statistical Analysis Plan"
        elif "csr" in lower_filename or "study report" in lower_filename:
            return "Clinical Study Report"
        # "ib" must stand alone as a token: a bare substring test flagged
        # names like "contrib.pdf" or "library.pdf" as Investigator Brochures.
        # Parentheses make the or/and grouping explicit.
        elif (re.search(r'(?<![a-z])ib(?![a-z])', lower_filename)
              or ("investigator" in lower_filename and "brochure" in lower_filename)):
            return "Investigator Brochure"

        # Content checks as fallback.
        if "statistical analysis plan" in lower_text:
            return "Statistical Analysis Plan"
        elif "clinical study protocol" in lower_text or "study protocol" in lower_text:
            return "Protocol"
        elif "clinical study report" in lower_text:
            return "Clinical Study Report"
        elif "investigator's brochure" in lower_text or "investigator brochure" in lower_text:
            return "Investigator Brochure"

        # Default when nothing matched.
        return "Unknown"

    def extract_protocol_id(self, text: str, filename: str) -> Optional[str]:
        """
        Extract the protocol ID from the document text (first, since the ID
        usually appears in the title block) or, failing that, the filename.

        Returns:
            Protocol ID string if found, None otherwise.
        """
        # Common protocol-ID shapes, tried in order of specificity.
        patterns = [
            # Common format like: Protocol B9531002
            r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)',
            # Format with hyphen like: C5161-001
            r'([A-Z][0-9]{4,}-[0-9]{3})',
            # Standard pattern like: ABC-123-456
            r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})',
            # Simple alphanumeric like: XYZ12345
            r'([A-Z]{2,5}[0-9]{4,6})'
        ]

        # IDs usually appear near the top of the document (title page).
        sample_text = text[:1000]

        for pattern in patterns:
            matches = re.search(pattern, sample_text)
            if matches:
                return matches.group(1)

        # Fall back to the filename.
        for pattern in patterns:
            matches = re.search(pattern, filename)
            if matches:
                return matches.group(1)

        return None

    def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract basic metadata from a PDF without detailed structure extraction.

        Returns:
            Dict with "document_id", "filename", "protocol_id", "type",
            "title", and "path". "protocol_id" may be None; "type" may be
            "Unknown"; "title" falls back to "Unknown Title".
        """
        filename = os.path.basename(pdf_path)
        full_text, _ = self.extract_text_from_pdf(pdf_path)

        # Sample the first part of the document; title pages carry most signal.
        sample_text = full_text[:5000]

        protocol_id = self.extract_protocol_id(sample_text, filename)
        doc_type = self.identify_document_type(sample_text, filename)

        # Heuristic title: first line of plausible title length (21-199 chars).
        lines = sample_text.split('\n')
        title = next((line.strip() for line in lines if len(line.strip()) > 20 and len(line.strip()) < 200), "Unknown Title")

        metadata = {
            "document_id": os.path.splitext(filename)[0],
            "filename": filename,
            "protocol_id": protocol_id,
            "type": doc_type,
            "title": title,
            "path": pdf_path
        }

        return metadata

    def process_complete_document(self, pdf_path: str) -> Dict[str, Any]:
        """
        Process a complete document for both structured data and vector storage.
        This is the main entry point for document processing.

        Returns:
            Dict with "status" ("success" or "error"), "pdf_path", "filename",
            and on success: "metadata", "sections", "page_count",
            "chunk_count", "chunks"; on failure: "error" with the message.
        """
        results = {
            "status": "success",
            "pdf_path": pdf_path,
            "filename": os.path.basename(pdf_path)
        }

        try:
            # Step 1: Extract basic metadata.
            metadata = self.extract_basic_metadata(pdf_path)
            results["metadata"] = metadata

            # Step 2: Extract full text and split into sections.
            full_text, pages = self.extract_text_from_pdf(pdf_path)
            sections = self.split_into_sections(full_text, os.path.basename(pdf_path))
            results["sections"] = sections  # Store the entire sections dictionary
            results["page_count"] = len(pages)

            # Step 3: Prepare chunks for the vector store.
            chunks = self.process_document_for_vector_store(pdf_path, metadata)
            results["chunk_count"] = len(chunks)
            results["chunks"] = chunks

            return results
        except Exception as e:
            # Top-level boundary: report the failure instead of crashing callers.
            results["status"] = "error"
            results["error"] = str(e)
            return results