File size: 8,063 Bytes
13d5ab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""
PDF Processing Module - Layer 1: PDF Understanding
Handles multimodal extraction: text, images, tables
"""

import asyncio
import io
import logging
import re
from typing import Dict, List, Any, Optional

import fitz  # PyMuPDF
import numpy as np
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

logger = logging.getLogger(__name__)


class PDFProcessor:
    """
    Comprehensive PDF processing for medical documents
    Implements hybrid extraction: native text + OCR fallback

    For every page the native text layer is read first; pages whose text
    layer is empty are rasterised and passed through Tesseract instead.
    Image descriptors, heuristic table candidates and report sections are
    derived alongside the text.
    """

    # Section names recognised by _extract_sections.
    _SECTION_HEADERS = (
        "HISTORY", "PHYSICAL EXAMINATION", "ASSESSMENT", "PLAN",
        "CHIEF COMPLAINT", "DIAGNOSIS", "FINDINGS", "IMPRESSION",
        "RECOMMENDATIONS", "LAB RESULTS", "MEDICATIONS", "ALLERGIES",
        "VITAL SIGNS", "PAST MEDICAL HISTORY", "FAMILY HISTORY",
        "SOCIAL HISTORY", "REVIEW OF SYSTEMS",
    )

    # Whole-word, case-insensitive header matcher. The leftmost match wins,
    # so "PAST MEDICAL HISTORY" is not mistaken for "HISTORY"; listing the
    # longest names first breaks ties between names sharing a prefix.
    _SECTION_HEADER_RE = re.compile(
        r"\b("
        + "|".join(
            re.escape(name)
            for name in sorted(_SECTION_HEADERS, key=len, reverse=True)
        )
        + r")\b",
        re.IGNORECASE,
    )

    # Lines this long or longer are treated as body text, never as headers.
    _MAX_HEADER_LINE_LENGTH = 50

    # (key in the returned metadata dict, key in doc.metadata)
    _METADATA_FIELDS = (
        ("title", "title"),
        ("author", "author"),
        ("subject", "subject"),
        ("creator", "creator"),
        ("producer", "producer"),
        ("creation_date", "creationDate"),
        ("modification_date", "modDate"),
    )

    def __init__(self):
        self.supported_formats = ['.pdf']
        logger.info("PDF Processor initialized")

    async def extract_content(self, file_path: str) -> Dict[str, Any]:
        """
        Extract multimodal content from PDF

        Args:
            file_path: Path of the PDF; handed to ``fitz.open`` and, for
                pages without a text layer, to the OCR fallback.

        Returns:
            Dict with:
            - text: extracted text content, pages joined by a blank line
            - images: list of image descriptors (page, index, xref,
              width, height)
            - tables: detected tabular content
            - metadata: document metadata
            - page_count: number of pages
            - extraction_method: "hybrid", or "hybrid_with_ocr" when at
              least one page needed OCR
            - sections: mapping of section name to section text

        Raises:
            Exception: Any error raised while opening or reading the
                document is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Starting PDF extraction: {file_path}")

            # Initialize result structure
            result = {
                "text": "",
                "images": [],
                "tables": [],
                "metadata": {},
                "page_count": 0,
                "extraction_method": "hybrid",
                "sections": {},
            }

            # Open PDF with PyMuPDF for robust extraction
            doc = fitz.open(file_path)
            try:
                result["page_count"] = len(doc)
                result["metadata"] = self._extract_metadata(doc)

                all_text = []
                all_images = []

                for page_num in range(result["page_count"]):
                    page = doc[page_num]
                    page_text = page.get_text()

                    # If native text extraction yields nothing, use OCR
                    if not page_text.strip():
                        logger.info(f"Page {page_num + 1}: Using OCR (no native text)")
                        page_text = await self._ocr_page(file_path, page_num)
                        result["extraction_method"] = "hybrid_with_ocr"

                    all_text.append(page_text)
                    all_images.extend(
                        self._extract_images_from_page(page, page_num)
                    )

                    # Detect tables (simplified detection)
                    result["tables"].extend(self._detect_tables(page_text))
            finally:
                # Release the document handle even when a page fails.
                doc.close()

            result["text"] = "\n\n".join(all_text)
            result["images"] = all_images

            # Extract structured sections
            result["sections"] = self._extract_sections(result["text"])

            logger.info(f"PDF extraction complete: {result['page_count']} pages, "
                       f"{len(result['images'])} images, {len(result['tables'])} tables")

            return result

        except Exception as e:
            logger.error(f"PDF extraction failed: {str(e)}")
            raise

    def _extract_metadata(self, doc: "fitz.Document") -> Dict[str, Any]:
        """
        Extract PDF metadata

        Args:
            doc: Open document whose ``metadata`` mapping is read.

        Returns:
            Dict with title, author, subject, creator, producer,
            creation_date and modification_date. Missing or ``None``
            values become "". An empty dict is returned (and a warning
            logged) when the metadata cannot be read at all.
        """
        metadata: Dict[str, Any] = {}
        try:
            pdf_metadata = doc.metadata
            # `or ""` also covers keys that are present but hold None.
            metadata = {
                out_key: pdf_metadata.get(source_key) or ""
                for out_key, source_key in self._METADATA_FIELDS
            }
        except Exception as e:
            logger.warning(f"Metadata extraction failed: {str(e)}")

        return metadata

    async def _ocr_page(self, file_path: str, page_num: int) -> str:
        """
        Perform OCR on a single page

        The rasterise + Tesseract work is blocking, so it is run in the
        event loop's default executor instead of on the loop itself.

        Args:
            file_path: Path of the PDF to rasterise.
            page_num: Zero-based page index.

        Returns:
            The recognised text, or "" when no image was produced or OCR
            failed (the failure is logged as a warning).
        """
        try:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None, self._ocr_page_sync, file_path, page_num
            )
        except Exception as e:
            logger.warning(f"OCR failed for page {page_num + 1}: {str(e)}")
            return ""

    def _ocr_page_sync(self, file_path: str, page_num: int) -> str:
        """Rasterise one page at 300 dpi and return its Tesseract text."""
        # pdf2image page numbers are 1-based.
        images = convert_from_path(
            file_path,
            first_page=page_num + 1,
            last_page=page_num + 1,
            dpi=300
        )
        if not images:
            return ""
        return pytesseract.image_to_string(images[0])

    def _extract_images_from_page(self, page: "fitz.Page", page_num: int) -> List[Dict[str, Any]]:
        """
        Extract images from a PDF page

        Only descriptors are collected; no pixel data is read.

        Args:
            page: Page whose ``get_images(full=True)`` listing is read.
            page_num: Zero-based page index (reported 1-based).

        Returns:
            One dict per image with page, index, xref, width and height.
            An empty list is returned (and a warning logged) on failure.
        """
        images = []
        try:
            for img_index, img_info in enumerate(page.get_images(full=True)):
                images.append({
                    "page": page_num + 1,
                    "index": img_index,
                    "xref": img_info[0],
                    "width": img_info[2],
                    "height": img_info[3]
                })
        except Exception as e:
            logger.warning(f"Image extraction failed for page {page_num + 1}: {str(e)}")

        return images

    def _detect_tables(self, text: str) -> List[Dict[str, Any]]:
        """
        Detect tabular content in text
        Simplified heuristic-based detection

        A line is table-like when it contains a tab or a pipe, or more
        than three double-space runs. Two or more consecutive table-like
        lines form a table, including a run that ends the text.

        Args:
            text: Text to scan, typically one page.

        Returns:
            List of dicts with "rows" (the raw lines) and "row_count".
        """
        tables: List[Dict[str, Any]] = []
        candidate_rows: List[str] = []

        def close_run() -> None:
            # At least header + 1 row is needed to count as a table.
            if len(candidate_rows) >= 2:
                tables.append({
                    "rows": list(candidate_rows),
                    "row_count": len(candidate_rows)
                })
            candidate_rows.clear()

        for line in text.split('\n'):
            if '\t' in line or '|' in line or line.count('  ') > 3:
                candidate_rows.append(line)
            else:
                close_run()

        # A table that runs to the last line has no terminating line.
        close_run()

        return tables

    def _extract_sections(self, text: str) -> Dict[str, str]:
        """
        Extract common medical report sections

        A line shorter than 50 characters (after stripping) that contains
        a known section name as a whole word starts that section; the
        leftmost name on the line is used. Text before the first header
        goes to "GENERAL". When a header is directly followed by ":" or
        "-", the rest of the line is kept as the section's first line.
        A section that appears more than once accumulates its content
        instead of being overwritten.

        Args:
            text: Full document text.

        Returns:
            Mapping of section name to newline-joined content. Sections
            that never received a line are omitted.
        """
        collected: Dict[str, List[str]] = {}
        current_section = "GENERAL"

        for line in text.split('\n'):
            stripped = line.strip()
            match = None
            if len(stripped) < self._MAX_HEADER_LINE_LENGTH:
                match = self._SECTION_HEADER_RE.search(stripped)

            if match is None:
                collected.setdefault(current_section, []).append(line)
                continue

            current_section = match.group(1).upper()

            # Keep an inline value such as "DIAGNOSIS: Hypertension".
            tail = stripped[match.end():].lstrip()
            if tail[:1] in (":", "-"):
                inline_value = tail[1:].strip()
                if inline_value:
                    collected.setdefault(current_section, []).append(inline_value)

        return {
            name: '\n'.join(content_lines)
            for name, content_lines in collected.items()
        }