Spaces:

cheesecz
/

ocr-ktp

Build error

App Files Files Community

ocr-ktp / src /ocr_service.py

cheesecz

Update src/ocr_service.py

8e2b9ee verified 9 months ago

raw

history blame contribute delete

8.21 kB

	import io
	import json
	import os
	import re
	from typing import Dict, List, Optional, Tuple

	import cv2
	import easyocr
	import google.generativeai as genai
	import numpy as np
	from dotenv import load_dotenv
	from paddleocr import PaddleOCR
	from PIL import Image

	from src.document_config import DocumentType, DocumentRequirement, HealthcareProcess, get_process_requirements

	# Load variables from a .env file (if one exists) at import time, before
	# DocumentOCRService.__init__ reads GOOGLE_API_KEY from the environment.
	load_dotenv()

	class DocumentOCRService:
	    """OCR pipeline for Indonesian documents using PaddleOCR, EasyOCR and Gemini.

	    Structured documents (KTP, KK, BPJS) are read with PaddleOCR and parsed by
	    looking for their printed field labels; every other document type is read
	    with EasyOCR and returned as one block of text. Each document is also sent
	    to a Gemini vision model, and can optionally be validated against the
	    requirements of a healthcare process.
	    """

	    # Document types that are routed to PaddleOCR and the label-based parser.
	    _STRUCTURED_DOCUMENT_TYPES = ('KTP', 'KK', 'BPJS')

	    # Printed labels searched for on each structured document type. The label
	    # doubles as the key in the extracted data. 'KK' has no labels defined, so
	    # parsing a KK yields an empty dict.
	    _STRUCTURED_FIELD_LABELS = {
	        'KTP': (
	            'NIK',
	            'Nama',
	            'Tempat/Tgl Lahir',
	            'Alamat',
	            'RT/RW',
	            'Kel/Desa',
	            'Kecamatan',
	            'Agama',
	            'Status Perkawinan',
	            'Pekerjaan',
	            'Kewarganegaraan',
	        ),
	        'BPJS': (
	            'Nomor Kartu',
	            'Nama Peserta',
	            'No. KTP',
	            'Faskes Tingkat 1',
	        ),
	    }

	    def __init__(self, gemini_model_name: str = 'gemini-pro-vision'):
	        """Create the OCR engines and the Gemini client.

	        Args:
	            gemini_model_name: Gemini model id used for document analysis.
	                Defaults to the model this service has always used.
	                NOTE(review): confirm this model id is still served by the API.
	        """
	        # Initialize PaddleOCR with Indonesian language support
	        self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='id')
	        # Initialize EasyOCR with Indonesian language support
	        self.easy_ocr = easyocr.Reader(['id'])
	        # Initialize Gemini
	        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
	        self.gemini_model = genai.GenerativeModel(gemini_model_name)

	    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
	        """Preprocess the image for better OCR results.

	        Args:
	            image: Input image; BGR, BGRA, single-channel or 2-D grayscale.

	        Returns:
	            A binarized single-channel image (adaptive Gaussian threshold).
	        """
	        # Convert to grayscale, tolerating inputs that are not 3-channel BGR
	        # (cv2.cvtColor with COLOR_BGR2GRAY rejects those).
	        if image.ndim == 2:
	            gray = image
	        elif image.shape[2] == 1:
	            gray = image[:, :, 0]
	        elif image.shape[2] == 4:
	            gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
	        else:
	            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	        # Apply adaptive thresholding
	        return cv2.adaptiveThreshold(
	            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
	        )

	    def process_document(self, image: np.ndarray, document_type: str, process_type: Optional[str] = None) -> Dict:
	        """
	        Process different types of Indonesian documents.

	        Args:
	            image: Input image as numpy array
	            document_type: Type of document (KTP, KK, BPJS, etc.)
	            process_type: Type of healthcare process (optional)

	        Returns:
	            Dictionary with the keys "extracted_data", "gemini_analysis" and
	            "validation_result" (None when no process_type is given).

	        Raises:
	            Whatever HealthcareProcess(process_type) raises for a value it does
	            not recognise (presumably ValueError if it is an Enum - confirm).
	        """
	        # Resolve the process first so an invalid process_type fails before
	        # any OCR or Gemini work has been done.
	        process = HealthcareProcess(process_type) if process_type else None

	        # Preprocess the image
	        processed_image = self.preprocess_image(image)

	        # Get OCR results
	        if document_type in self._STRUCTURED_DOCUMENT_TYPES:
	            ocr_result = self.paddle_ocr.ocr(processed_image, cls=True)
	            extracted_data = self._parse_structured_document(ocr_result, document_type)
	        else:
	            ocr_result = self.easy_ocr.readtext(processed_image)
	            extracted_data = self._parse_unstructured_document(ocr_result, document_type)

	        # Use Gemini VLM for additional analysis (on the original image, not
	        # the thresholded one)
	        gemini_analysis = self._analyze_with_gemini(image, document_type, extracted_data)

	        # Validate against process requirements if process_type is provided
	        validation_result = None
	        if process is not None:
	            validation_result = self._validate_against_process(
	                extracted_data,
	                document_type,
	                process
	            )

	        return {
	            "extracted_data": extracted_data,
	            "gemini_analysis": gemini_analysis,
	            "validation_result": validation_result
	        }

	    def _analyze_with_gemini(self, image: np.ndarray, document_type: str, ocr_data: Dict) -> Dict:
	        """Analyze document using Gemini VLM.

	        Args:
	            image: Document image to send to the model.
	            document_type: Document type name interpolated into the prompt.
	            ocr_data: OCR output included in the prompt as JSON.

	        Returns:
	            Dict with "analysis" (the response text) and "confidence_score"
	            (the first candidate's ``score`` attribute, or None if absent).

	        Raises:
	            ValueError: If the image cannot be encoded as JPEG.
	        """
	        # Convert image to JPEG bytes
	        encoded_ok, buffer = cv2.imencode('.jpg', image)
	        if not encoded_ok:
	            raise ValueError("Could not encode image as JPEG for Gemini analysis")
	        image_bytes = buffer.tobytes()

	        # Prepare prompt for Gemini. The prompt text is kept verbatim, which is
	        # why its lines are not indented with the surrounding code.
	        prompt = f"""
	Analyze this {document_type} document and provide:
	1. Document authenticity assessment
	2. Any potential issues or inconsistencies
	3. Additional information not captured by OCR
	4. Recommendations for document improvement if needed

	OCR extracted data: {json.dumps(ocr_data, indent=2)}
	"""

	        # Raw bytes are not a valid content part; send the image as an inline
	        # blob with an explicit MIME type.
	        image_part = {"mime_type": "image/jpeg", "data": image_bytes}
	        response = self.gemini_model.generate_content([prompt, image_part])

	        # Read the score defensively: the attribute may not exist on a
	        # candidate, and the candidate list may be empty.
	        candidates = getattr(response, 'candidates', None) or []
	        confidence_score = getattr(candidates[0], 'score', None) if candidates else None

	        return {
	            "analysis": response.text,
	            "confidence_score": confidence_score
	        }

	    @staticmethod
	    def _is_blank(value) -> bool:
	        """Return True for None or a string that is empty or whitespace only."""
	        return value is None or (isinstance(value, str) and not value.strip())

	    def _validate_against_process(
	        self,
	        extracted_data: Dict,
	        document_type: str,
	        process: "HealthcareProcess"
	    ) -> Dict:
	        """Validate document against process requirements.

	        Args:
	            extracted_data: Field name -> value mapping produced by the parsers.
	            document_type: Document type name compared to each requirement's
	                ``document_type.value``.
	            process: Process whose requirements are looked up.

	        Returns:
	            Dict with "is_valid" and "missing_fields", plus "message" when the
	            document type is not part of the process, or
	            "document_requirement" (the requirement description) otherwise.
	        """
	        requirements = get_process_requirements(process)
	        document_requirement = next(
	            (req for req in requirements if req.document_type.value == document_type),
	            None
	        )

	        if not document_requirement:
	            return {
	                "is_valid": False,
	                "missing_fields": [],
	                "message": f"Document type {document_type} is not required for this process"
	            }

	        # A required field counts as missing when it is absent OR when OCR
	        # found the label but no value for it.
	        missing_fields = [
	            field
	            for field, rule in document_requirement.validation_rules.items()
	            if rule == "required" and self._is_blank(extracted_data.get(field))
	        ]

	        return {
	            "is_valid": not missing_fields,
	            "missing_fields": missing_fields,
	            "document_requirement": document_requirement.description
	        }

	    @staticmethod
	    def _requirement_to_dict(req) -> Dict:
	        """Convert one requirement object into a plain dict."""
	        return {
	            "type": req.document_type.value,
	            "description": req.description,
	            "validation_rules": req.validation_rules
	        }

	    def get_process_requirements(self, process: "HealthcareProcess") -> Dict:
	        """Get document requirements for a specific process.

	        Returns:
	            Dict with "required_documents" and "optional_documents", each a
	            list of dicts with "type", "description" and "validation_rules".
	        """
	        # The bare name resolves to the module-level function imported from
	        # src.document_config, not to this method.
	        requirements = get_process_requirements(process)
	        return {
	            "required_documents": [
	                self._requirement_to_dict(req) for req in requirements if req.is_required
	            ],
	            "optional_documents": [
	                self._requirement_to_dict(req) for req in requirements if not req.is_required
	            ]
	        }

	    @staticmethod
	    def _line_text(entry) -> Optional[str]:
	        """Return the text of a ``[box, (text, confidence)]`` entry, else None."""
	        if isinstance(entry, (list, tuple)) and len(entry) == 2:
	            payload = entry[1]
	            if isinstance(payload, (list, tuple)) and payload and isinstance(payload[0], str):
	                return payload[0]
	        return None

	    def _iter_ocr_texts(self, ocr_result):
	        """Yield recognized text lines from a PaddleOCR result.

	        Accepts both a flat list of ``[box, (text, confidence)]`` entries and
	        a list of such lists (one per image). None entries, which stand for a
	        page without detected text, are skipped.
	        """
	        for item in ocr_result or []:
	            text = self._line_text(item)
	            if text is not None:
	                yield text
	            elif isinstance(item, (list, tuple)):
	                for entry in item:
	                    text = self._line_text(entry)
	                    if text is not None:
	                        yield text

	    def _parse_structured_document(self, ocr_result: List, document_type: str) -> Dict:
	        """Parse results from structured documents like KTP, KK, BPJS.

	        Each OCR line is searched (case-insensitively) for the labels defined
	        for the document type. The text after the last occurrence of a label,
	        without surrounding whitespace or a leading colon, becomes that
	        field's value. If a label is found on several lines the last one wins.

	        Args:
	            ocr_result: Result of ``PaddleOCR.ocr``.
	            document_type: Document type name; types without a label table
	                produce an empty dict.

	        Returns:
	            Mapping of label -> extracted value.
	        """
	        extracted_data = {}
	        labels = self._STRUCTURED_FIELD_LABELS.get(document_type, ())
	        if not labels:
	            return extracted_data

	        # Compile once per call; labels are literal text, hence re.escape.
	        label_patterns = [
	            (label, re.compile(re.escape(label), re.IGNORECASE)) for label in labels
	        ]

	        for text in self._iter_ocr_texts(ocr_result):
	            for label, pattern in label_patterns:
	                parts = pattern.split(text)
	                # More than one part means the label occurs in this line.
	                if len(parts) > 1:
	                    extracted_data[label] = parts[-1].strip().lstrip(':').strip()

	        return extracted_data

	    def _parse_unstructured_document(self, ocr_result: List, document_type: str) -> Dict:
	        """Parse results from unstructured documents like medical bills, prescriptions.

	        Args:
	            ocr_result: Sequence of 3-item entries whose middle item is the
	                recognized text (the shape ``easyocr.Reader.readtext`` is
	                unpacked as here).
	            document_type: Unused; kept for a signature parallel to
	                ``_parse_structured_document``.

	        Returns:
	            Dict with a single "full_text" key: all texts joined by spaces.
	        """
	        # Combine all text for Gemini analysis
	        return {"full_text": " ".join(text for _, text, _ in ocr_result)}