File size: 8,208 Bytes
2eb2e3c
 
 
 
 
 
 
 
8e2b9ee
2eb2e3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from typing import Dict, List, Optional, Tuple
import cv2
import numpy as np
from paddleocr import PaddleOCR
import easyocr
from PIL import Image
import io
import google.generativeai as genai
from src.document_config import DocumentType, DocumentRequirement, HealthcareProcess, get_process_requirements
import json
import os
from dotenv import load_dotenv

load_dotenv()

class DocumentOCRService:
    """OCR + vision-LLM pipeline for Indonesian healthcare documents.

    Routes structured identity cards (KTP, KK, BPJS) through PaddleOCR,
    everything else (medical bills, prescriptions, ...) through EasyOCR,
    then asks Gemini for an authenticity/consistency analysis and
    optionally validates the extracted fields against the document
    requirements of a healthcare process.
    """

    # Document types handled by the structured (label -> value) parser.
    STRUCTURED_DOCUMENT_TYPES = ('KTP', 'KK', 'BPJS')

    # Field labels expected on each structured document. The label text
    # doubles as the (case-insensitive) search pattern inside OCR lines.
    # NOTE(review): 'KK' is routed to the structured path but has no
    # mapping yet, so it currently yields an empty result — confirm
    # whether KK labels should be added here.
    FIELD_MAPPINGS = {
        'KTP': [
            'NIK',
            'Nama',
            'Tempat/Tgl Lahir',
            'Alamat',
            'RT/RW',
            'Kel/Desa',
            'Kecamatan',
            'Agama',
            'Status Perkawinan',
            'Pekerjaan',
            'Kewarganegaraan',
        ],
        'BPJS': [
            'Nomor Kartu',
            'Nama Peserta',
            'No. KTP',
            'Faskes Tingkat 1',
        ],
    }

    def __init__(self):
        """Initialise both OCR engines and the Gemini client.

        Requires the GOOGLE_API_KEY environment variable (loaded from
        .env by the module-level load_dotenv() call).
        """
        # PaddleOCR with text-angle classification, Indonesian model.
        self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='id')
        # EasyOCR reader for unstructured documents, Indonesian model.
        self.easy_ocr = easyocr.Reader(['id'])
        # Gemini vision model for document authenticity analysis.
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.gemini_model = genai.GenerativeModel('gemini-pro-vision')

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """Binarise *image* (BGR) for better OCR results.

        Greyscale conversion followed by Gaussian adaptive thresholding,
        which copes with uneven lighting on photographed cards.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        return thresh

    def process_document(self, image: np.ndarray, document_type: str, process_type: Optional[str] = None) -> Dict:
        """
        Process different types of Indonesian documents.

        Args:
            image: Input image as a BGR numpy array.
            document_type: Type of document (KTP, KK, BPJS, etc.).
            process_type: Healthcare process name (optional); must be a
                valid HealthcareProcess value when given.

        Returns:
            Dict with keys "extracted_data", "gemini_analysis" and
            "validation_result" (None when no process_type was given).

        Raises:
            ValueError: if process_type is not a valid HealthcareProcess.
        """
        processed_image = self.preprocess_image(image)

        # Structured cards have fixed label/value layouts -> PaddleOCR;
        # free-form documents -> EasyOCR plus full-text passthrough.
        if document_type in self.STRUCTURED_DOCUMENT_TYPES:
            ocr_result = self.paddle_ocr.ocr(processed_image, cls=True)
            extracted_data = self._parse_structured_document(ocr_result, document_type)
        else:
            ocr_result = self.easy_ocr.readtext(processed_image)
            extracted_data = self._parse_unstructured_document(ocr_result, document_type)

        # Second opinion from the vision model on the *original* image
        # (the binarised copy would hide visual authenticity cues).
        gemini_analysis = self._analyze_with_gemini(image, document_type, extracted_data)

        validation_result = None
        if process_type:
            validation_result = self._validate_against_process(
                extracted_data,
                document_type,
                HealthcareProcess(process_type)
            )

        return {
            "extracted_data": extracted_data,
            "gemini_analysis": gemini_analysis,
            "validation_result": validation_result
        }

    def _analyze_with_gemini(self, image: np.ndarray, document_type: str, ocr_data: Dict) -> Dict:
        """Ask Gemini for an authenticity/consistency analysis of *image*.

        Returns a dict with "analysis" (model text or None) and
        "confidence_score" (None when the SDK exposes no score).
        """
        # Encode to JPEG; check the success flag instead of assuming it.
        ok, buffer = cv2.imencode('.jpg', image)
        if not ok:
            return {"analysis": None, "confidence_score": None}

        prompt = f"""
        Analyze this {document_type} document and provide:
        1. Document authenticity assessment
        2. Any potential issues or inconsistencies
        3. Additional information not captured by OCR
        4. Recommendations for document improvement if needed
        
        OCR extracted data: {json.dumps(ocr_data, indent=2)}
        """

        # The generativeai SDK expects inline image parts as a
        # mime_type/data blob, not raw bytes.
        image_part = {"mime_type": "image/jpeg", "data": buffer.tobytes()}
        response = self.gemini_model.generate_content([prompt, image_part])

        # Candidates may not carry a numeric score; degrade to None
        # rather than raising AttributeError.
        candidates = getattr(response, 'candidates', None)
        confidence = getattr(candidates[0], 'score', None) if candidates else None

        return {
            "analysis": response.text,
            "confidence_score": confidence
        }

    def _validate_against_process(
        self,
        extracted_data: Dict,
        document_type: str,
        process: HealthcareProcess
    ) -> Dict:
        """Validate extracted fields against *process* requirements.

        Returns a dict with "is_valid" plus either "message" (document
        not required for the process) or "missing_fields" and
        "document_requirement".
        """
        requirements = get_process_requirements(process)
        document_requirement = next(
            (req for req in requirements if req.document_type.value == document_type),
            None
        )

        if not document_requirement:
            return {
                "is_valid": False,
                "message": f"Document type {document_type} is not required for this process"
            }

        # A field is missing when its rule says "required" and OCR did
        # not produce it at all.
        missing_fields = [
            field
            for field, rule in document_requirement.validation_rules.items()
            if rule == "required" and field not in extracted_data
        ]

        return {
            "is_valid": not missing_fields,
            "missing_fields": missing_fields,
            "document_requirement": document_requirement.description
        }

    def get_process_requirements(self, process: HealthcareProcess) -> Dict:
        """Return required/optional document specs for *process*.

        NOTE: this method intentionally shares its name with the
        module-level get_process_requirements() it delegates to.
        """
        requirements = get_process_requirements(process)

        def describe(req) -> Dict:
            # One serialisable entry per DocumentRequirement.
            return {
                "type": req.document_type.value,
                "description": req.description,
                "validation_rules": req.validation_rules
            }

        return {
            "required_documents": [describe(r) for r in requirements if r.is_required],
            "optional_documents": [describe(r) for r in requirements if not r.is_required]
        }

    @staticmethod
    def _iter_ocr_texts(ocr_result: List):
        """Yield text strings from a PaddleOCR result.

        PaddleOCR returns one list per page, each entry shaped
        [bbox, (text, confidence)]. The previous code indexed the outer
        per-page list as if it were a flat detection list, which reads a
        bbox instead of text; handle both the nested per-page shape and
        an already-flat list of detections defensively.
        """
        for page in ocr_result or []:
            if not page:
                continue
            # Flat shape: *page* is itself one [bbox, (text, conf)] entry.
            if (len(page) == 2 and isinstance(page[1], (list, tuple))
                    and page[1] and isinstance(page[1][0], str)):
                yield page[1][0]
                continue
            for detection in page:
                try:
                    yield detection[1][0]
                except (TypeError, IndexError):
                    # Skip malformed entries rather than abort the parse.
                    continue

    def _parse_structured_document(self, ocr_result: List, document_type: str) -> Dict:
        """Parse label/value fields from structured cards (KTP, KK, BPJS).

        Matching is case-insensitive; the value is whatever follows the
        label on the same OCR line, with separator punctuation stripped.
        (The old code matched case-insensitively but split on the
        case-sensitive label, returning the whole line on case mismatch.)
        """
        extracted_data: Dict[str, str] = {}
        labels = self.FIELD_MAPPINGS.get(document_type, [])

        for text in self._iter_ocr_texts(ocr_result):
            lowered = text.lower()
            for label in labels:
                idx = lowered.find(label.lower())
                if idx == -1:
                    continue
                # Value = remainder of the line after the label, minus
                # the ':' / whitespace separator OCR usually includes.
                value = text[idx + len(label):].strip().lstrip(':').strip()
                if value:
                    extracted_data[label] = value

        return extracted_data

    def _parse_unstructured_document(self, ocr_result: List, document_type: str) -> Dict:
        """Collect all EasyOCR text from free-form documents.

        EasyOCR entries are (bbox, text, confidence); only the text is
        kept, joined into one string for downstream Gemini analysis.
        """
        full_text = " ".join(entry[1] for entry in ocr_result)
        return {"full_text": full_text}