# ocr-ktp / src/ocr_service.py
# Repository page residue kept for reference: "cheesecz's picture",
# commit message "Update src/ocr_service.py", commit 8e2b9ee (verified).
import io
import json
import os
import re
from typing import Dict, List, Optional, Tuple

import cv2
import numpy as np
from paddleocr import PaddleOCR
import easyocr
from PIL import Image
import google.generativeai as genai
from dotenv import load_dotenv

from src.document_config import DocumentType, DocumentRequirement, HealthcareProcess, get_process_requirements
load_dotenv()
class DocumentOCRService:
    """OCR pipeline for Indonesian documents (KTP, KK, BPJS and free-form ones).

    Structured documents are read with PaddleOCR and mapped onto labelled
    fields; every other document type is read with EasyOCR and returned as a
    single text blob. The original image and the extracted data are then sent
    to a Gemini vision model, and the extracted data can optionally be
    validated against the document requirements of a healthcare process.
    """

    # Document types that are routed to PaddleOCR and the label-based parser.
    _STRUCTURED_TYPES = ('KTP', 'KK', 'BPJS')

    # Printed labels per structured document type. Each label is also the key
    # under which its value is stored. 'KK' has no labels defined here, so it
    # always parses to an empty dict.
    _FIELD_LABELS = {
        'KTP': (
            'NIK',
            'Nama',
            'Tempat/Tgl Lahir',
            'Alamat',
            'RT/RW',
            'Kel/Desa',
            'Kecamatan',
            'Agama',
            'Status Perkawinan',
            'Pekerjaan',
            'Kewarganegaraan',
        ),
        'BPJS': (
            'Nomor Kartu',
            'Nama Peserta',
            'No. KTP',
            'Faskes Tingkat 1',
        ),
    }

    # Labels compiled once as literal, case-insensitive patterns. Escaping
    # keeps characters such as '.' and '/' from acting as regex syntax.
    _FIELD_PATTERNS = {
        doc_type: tuple(
            (label, re.compile(re.escape(label), re.IGNORECASE))
            for label in labels
        )
        for doc_type, labels in _FIELD_LABELS.items()
    }

    def __init__(self):
        """Create both OCR engines and the Gemini client.

        The Gemini API key is read from the ``GOOGLE_API_KEY`` environment
        variable.
        """
        # Initialize PaddleOCR with Indonesian language support
        self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='id')
        # Initialize EasyOCR with Indonesian language support
        self.easy_ocr = easyocr.Reader(['id'])
        # Initialize Gemini
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        # NOTE(review): 'gemini-pro-vision' appears to have been retired by
        # Google -- confirm this model name is still served.
        self.gemini_model = genai.GenerativeModel('gemini-pro-vision')

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """Preprocess the image for better OCR results.

        Args:
            image: Input image as a numpy array. Three-channel input is
                treated as BGR, four-channel input as BGRA, and
                two-dimensional input as already grayscale.

        Returns:
            The adaptively thresholded single-channel image.
        """
        # Convert to grayscale; cvtColor(BGR2GRAY) rejects inputs that are
        # already single-channel or that carry an alpha channel.
        if image.ndim == 2:
            gray = image
        elif image.shape[2] == 4:
            gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        else:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Apply adaptive thresholding
        return cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

    def process_document(self, image: np.ndarray, document_type: str, process_type: Optional[str] = None) -> Dict:
        """
        Process different types of Indonesian documents.

        Args:
            image: Input image as numpy array
            document_type: Type of document (KTP, KK, BPJS, etc.)
            process_type: Type of healthcare process (optional)

        Returns:
            Dictionary containing extracted information and validation results
            under the keys "extracted_data", "gemini_analysis" and
            "validation_result" (None when no process_type is given).

        Raises:
            Whatever ``HealthcareProcess(process_type)`` raises for an
            unknown process type; this now happens before any OCR work.
        """
        # Resolve the process first so an invalid process_type fails before
        # the OCR and Gemini calls are made.
        process = HealthcareProcess(process_type) if process_type else None

        # Preprocess the image
        processed_image = self.preprocess_image(image)

        # Get OCR results
        if document_type in self._STRUCTURED_TYPES:
            ocr_result = self.paddle_ocr.ocr(processed_image, cls=True)
            extracted_data = self._parse_structured_document(ocr_result, document_type)
        else:
            ocr_result = self.easy_ocr.readtext(processed_image)
            extracted_data = self._parse_unstructured_document(ocr_result, document_type)

        # Use Gemini VLM for additional analysis (on the original image, not
        # the thresholded one)
        gemini_analysis = self._analyze_with_gemini(image, document_type, extracted_data)

        # Validate against process requirements if process_type is provided
        validation_result = None
        if process is not None:
            validation_result = self._validate_against_process(
                extracted_data,
                document_type,
                process,
            )

        return {
            "extracted_data": extracted_data,
            "gemini_analysis": gemini_analysis,
            "validation_result": validation_result,
        }

    def _analyze_with_gemini(self, image: np.ndarray, document_type: str, ocr_data: Dict) -> Dict:
        """Analyze document using Gemini VLM.

        Args:
            image: Image to send, encoded here as JPEG.
            document_type: Document type name inserted into the prompt.
            ocr_data: OCR output, embedded in the prompt as JSON.

        Returns:
            Dict with "analysis" (the response text) and "confidence_score"
            (the first candidate's ``score`` attribute if it has one,
            otherwise None).

        Raises:
            ValueError: If the image cannot be encoded as JPEG.
        """
        # Convert image to bytes
        encoded_ok, buffer = cv2.imencode('.jpg', image)
        if not encoded_ok:
            raise ValueError("Could not encode image as JPEG for Gemini analysis")
        image_bytes = buffer.tobytes()

        # Prepare prompt for Gemini
        prompt = (
            "\n"
            f"Analyze this {document_type} document and provide:\n"
            "1. Document authenticity assessment\n"
            "2. Any potential issues or inconsistencies\n"
            "3. Additional information not captured by OCR\n"
            "4. Recommendations for document improvement if needed\n"
            f"OCR extracted data: {json.dumps(ocr_data, indent=2)}\n"
        )

        # Get Gemini's analysis. Raw bytes are not a valid content part; the
        # image has to be passed as a mime_type/data dict (or a PIL image).
        image_part = {"mime_type": "image/jpeg", "data": image_bytes}
        response = self.gemini_model.generate_content([prompt, image_part])

        # A candidate is not guaranteed to expose a score, and the candidate
        # list may be empty, so look both up defensively.
        candidates = getattr(response, 'candidates', None) or []
        confidence_score = getattr(candidates[0], 'score', None) if candidates else None

        return {
            "analysis": response.text,
            "confidence_score": confidence_score,
        }

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True for None and for empty or whitespace-only strings."""
        return value is None or (isinstance(value, str) and not value.strip())

    def _validate_against_process(
        self,
        extracted_data: Dict,
        document_type: str,
        process: "HealthcareProcess"
    ) -> Dict:
        """Validate document against process requirements.

        Args:
            extracted_data: Field name to value mapping produced by parsing.
            document_type: Document type compared with each requirement's
                ``document_type.value``.
            process: Process whose requirements are looked up.

        Returns:
            ``{"is_valid": False, "message": ...}`` when the process has no
            requirement for this document type; otherwise a dict with
            "is_valid", "missing_fields" and "document_requirement".
        """
        requirements = get_process_requirements(process)
        document_requirement = next(
            (req for req in requirements if req.document_type.value == document_type),
            None
        )
        if not document_requirement:
            return {
                "is_valid": False,
                "message": f"Document type {document_type} is not required for this process"
            }

        # Check required fields; a field that is present but blank counts as
        # missing, because a label with no value carries no information.
        missing_fields = [
            field
            for field, rule in document_requirement.validation_rules.items()
            if rule == "required" and self._is_blank(extracted_data.get(field))
        ]

        return {
            "is_valid": not missing_fields,
            "missing_fields": missing_fields,
            "document_requirement": document_requirement.description
        }

    @staticmethod
    def _describe_requirement(req) -> Dict:
        """Convert one requirement object into a plain dict."""
        return {
            "type": req.document_type.value,
            "description": req.description,
            "validation_rules": req.validation_rules,
        }

    def get_process_requirements(self, process: "HealthcareProcess") -> Dict:
        """Get document requirements for a specific process.

        Returns:
            Dict with "required_documents" and "optional_documents", each a
            list of dicts with "type", "description" and "validation_rules",
            split on each requirement's ``is_required`` flag.
        """
        # Inside a method this name resolves to the module-level function,
        # not to this method.
        requirements = get_process_requirements(process)
        required_documents = []
        optional_documents = []
        for req in requirements:
            bucket = required_documents if req.is_required else optional_documents
            bucket.append(self._describe_requirement(req))
        return {
            "required_documents": required_documents,
            "optional_documents": optional_documents,
        }

    @staticmethod
    def _is_text_line(entry) -> bool:
        """Return True when entry looks like one OCR line: [box, (text, score)]."""
        return (
            isinstance(entry, (list, tuple))
            and len(entry) == 2
            and isinstance(entry[1], (list, tuple))
            and len(entry[1]) > 0
            and isinstance(entry[1][0], str)
        )

    @classmethod
    def _collect_line_texts(cls, ocr_result) -> List[str]:
        """Return the recognized text of every line in a PaddleOCR result.

        Accepts both result shapes: a flat list of lines, and a list with one
        entry per image where each entry is a list of lines or None when
        nothing was detected.
        """
        texts = []
        for entry in ocr_result or []:
            if cls._is_text_line(entry):
                texts.append(entry[1][0])
            elif isinstance(entry, (list, tuple)):
                # Per-image list of lines.
                texts.extend(line[1][0] for line in entry if cls._is_text_line(line))
            # Anything else (e.g. None for an image without text) is skipped.
        return texts

    @staticmethod
    def _value_after(pattern, text: str) -> Optional[str]:
        """Return the text after the last match of pattern, or None if absent.

        Surrounding whitespace and a leading ':' separator are removed.
        """
        matches = list(pattern.finditer(text))
        if not matches:
            return None
        return text[matches[-1].end():].strip().lstrip(':').strip()

    def _parse_structured_document(self, ocr_result: List, document_type: str) -> Dict:
        """Parse results from structured documents like KTP, KK, BPJS.

        Every OCR line is checked against each label defined for the document
        type. When a label occurs in the line (case-insensitively), the text
        after it is stored under that label; a later line containing the same
        label overwrites an earlier one.

        Args:
            ocr_result: PaddleOCR output, flat or nested per image.
            document_type: Structured document type; types without labels
                yield an empty dict.

        Returns:
            Mapping of label to extracted value.
        """
        extracted_data = {}
        field_patterns = self._FIELD_PATTERNS.get(document_type, ())
        if not field_patterns:
            return extracted_data

        # Process OCR results and map to fields
        for text in self._collect_line_texts(ocr_result):
            for field, pattern in field_patterns:
                # Extract the value after the field name
                value = self._value_after(pattern, text)
                if value is not None:
                    extracted_data[field] = value
        return extracted_data

    def _parse_unstructured_document(self, ocr_result: List, document_type: str) -> Dict:
        """Parse results from unstructured documents like medical bills, prescriptions.

        Args:
            ocr_result: Sequence of three-item entries whose second item is
                the recognized text.
            document_type: Not used by this parser.

        Returns:
            ``{"full_text": ...}`` with all recognized text joined by spaces.
        """
        # Combine all text for Gemini analysis
        full_text = " ".join(text for _, text, _ in ocr_result)
        return {"full_text": full_text}