# ai-certificate / tests /test_ocr.py
# jo8780's picture
# Initial Space deploy: AI Certificate Analyzer
# 17f1739
import pytest
import numpy as np
import cv2
from pathlib import Path
import sys
sys.path.append(str(Path(__file__).parent.parent))
from app.analyzers.ocr.english_ocr import EnglishOCREngine
from app.analyzers.ocr.amharic_ocr import AmharicOCREngine
from app.analyzers.ocr.multilingual_ocr import MultilingualOCREngine
class TestOCREngines:
    """Production tests for the English, Amharic and multilingual OCR engines.

    Most tests draw synthetic text onto a white image with OpenCV, pass the
    image to an engine's ``extract`` coroutine and check the keys and value
    ranges of the returned result dictionary.
    """

    @pytest.fixture
    def english_engine(self):
        """Provide a newly constructed ``EnglishOCREngine``."""
        return EnglishOCREngine()

    @pytest.fixture
    def amharic_engine(self):
        """Provide a newly constructed ``AmharicOCREngine``."""
        return AmharicOCREngine()

    @pytest.fixture
    def multilingual_engine(self):
        """Provide a newly constructed ``MultilingualOCREngine``."""
        return MultilingualOCREngine()

    def create_test_image(self, text: str, lang: str = 'eng'):
        """Draw ``text`` in black, centred on a white 400x200 3-channel image.

        Args:
            text: Text to draw. When ``lang`` is ``'amh'`` and ``text`` is
                empty, a default Amharic sample string is drawn instead.
            lang: Language code of ``text``; only ``'amh'`` is special-cased.

        Returns:
            A ``uint8`` NumPy array of shape ``(200, 400, 3)``.
        """
        if lang == 'amh':
            # Fall back to a sample Amharic phrase when no text was supplied.
            # NOTE(review): per the OpenCV docs, cv2.putText replaces symbols
            # the Hershey fonts cannot render with question marks, so Amharic
            # glyphs will presumably not appear as written - confirm before
            # relying on this helper for Amharic OCR assertions.
            text = text or "ሙከራ ጽሑፍ"

        img = np.full((200, 400, 3), 255, dtype=np.uint8)

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.7
        thickness = 2

        # getTextSize returns ((width, height), baseline); only the size is
        # needed to centre the text on the canvas.
        text_width, text_height = cv2.getTextSize(
            text, font, font_scale, thickness
        )[0]
        text_x = (img.shape[1] - text_width) // 2
        text_y = (img.shape[0] + text_height) // 2

        cv2.putText(
            img, text, (text_x, text_y), font, font_scale, (0, 0, 0), thickness
        )
        return img

    @pytest.mark.asyncio
    async def test_english_ocr_basic(self, english_engine):
        """A simple English heading yields a successful, labelled result."""
        test_text = "CERTIFICATE OF COMPLETION"
        test_image = self.create_test_image(test_text, 'eng')

        result = await english_engine.extract(test_image)

        assert result['success'] is True
        assert 'extracted_text' in result
        assert 'ocr_confidence' in result
        assert result['ocr_confidence'] > 0.1
        assert result['language'] == 'eng'

    @pytest.mark.asyncio
    async def test_english_field_extraction(self, english_engine):
        """Certificate-like lines produce extracted fields with sane confidences."""
        text_lines = [
            "Name: John Michael Doe",
            "Student ID: STU2023-456",
            "University: Oxford University",
            "Course: Computer Science",
            "GPA: 3.75",
            "Issue Date: 2023-06-15",
        ]

        # Render one line every 50 px, starting 50 px from the top-left.
        test_image = np.full((400, 600, 3), 255, dtype=np.uint8)
        font = cv2.FONT_HERSHEY_SIMPLEX
        for i, line in enumerate(text_lines):
            y_pos = 50 + i * 50
            cv2.putText(test_image, line, (50, y_pos), font, 0.6, (0, 0, 0), 2)

        result = await english_engine.extract(test_image)
        assert result['success']

        # Should extract at least some fields.
        extracted = result['extracted_text']
        assert len(extracted) >= 2

        # Every per-field confidence must lie in the closed interval [0, 1].
        for conf in result['field_confidence'].values():
            assert 0 <= conf <= 1

    def test_amharic_ocr_availability(self, amharic_engine):
        """The Amharic engine initialises and reports whether it is available.

        The test is skipped, not failed, when the engine reports that
        Amharic OCR is not available.
        """
        assert amharic_engine is not None
        assert hasattr(amharic_engine, 'amharic_available')

        if not amharic_engine.amharic_available:
            pytest.skip("Amharic OCR not available")

    @pytest.mark.asyncio
    async def test_multilingual_detection(self, multilingual_engine):
        """English input to the multilingual engine is handled as English."""
        eng_image = self.create_test_image("Certificate Test", 'eng')

        result = await multilingual_engine.extract(eng_image)

        assert result['success']
        assert 'language' in result or 'engine_used' in result

        # When the engine reports which backend ran, it should be the
        # English one for English text.
        if 'engine_used' in result:
            assert 'english' in result['engine_used'].lower()

    @pytest.mark.asyncio
    async def test_ocr_error_handling(self, english_engine):
        """An all-black image is handled without raising.

        The engine may either report failure (with an ``'error'`` entry) or
        report success with a confidence below 0.5.
        """
        empty_image = np.zeros((100, 100, 3), dtype=np.uint8)

        result = await english_engine.extract(empty_image)

        assert 'success' in result
        if not result['success']:
            assert 'error' in result
        else:
            assert result['ocr_confidence'] < 0.5

    @pytest.mark.asyncio
    async def test_batch_ocr_processing(self, multilingual_engine):
        """``batch_extract`` returns one result per input image."""
        images = [
            self.create_test_image(f"Test Certificate {i+1}", 'eng')
            for i in range(3)
        ]

        results = await multilingual_engine.batch_extract(images)

        assert len(results) == 3
        for result in results:
            assert 'success' in result
            assert 'processing_time' in result or 'timestamp' in result

    def test_ocr_configurations(self, english_engine):
        """The English engine exposes its configs and field patterns as dicts."""
        assert hasattr(english_engine, 'configs')
        assert isinstance(english_engine.configs, dict)

        # Should have multiple configurations for different scenarios.
        assert len(english_engine.configs) >= 3
        assert 'document' in english_engine.configs
        assert 'single_line' in english_engine.configs

        assert hasattr(english_engine, 'field_patterns')
        assert isinstance(english_engine.field_patterns, dict)
        assert 'name' in english_engine.field_patterns
        assert 'student_id' in english_engine.field_patterns

    @pytest.mark.asyncio
    async def test_confidence_calculation(self, english_engine):
        """A sharp image scores a higher OCR confidence than a blurred copy.

        The test is skipped when either extraction reports failure, because
        the two confidences cannot then be compared meaningfully.
        """
        good_image = self.create_test_image("Clear Text for OCR", 'eng')
        good_result = await english_engine.extract(good_image)

        # Degrade the same image with a Gaussian blur (9x9 kernel, sigma 5).
        poor_image = cv2.GaussianBlur(good_image, (9, 9), 5)
        poor_result = await english_engine.extract(poor_image)

        if not (good_result['success'] and poor_result['success']):
            # Make the inconclusive outcome visible instead of passing silently.
            pytest.skip("OCR extraction failed; confidences cannot be compared")

        assert good_result['ocr_confidence'] > poor_result['ocr_confidence']

    def test_date_validation(self, english_engine):
        """``_validate_date`` accepts well-formed dates and rejects bad ones."""
        valid_dates = ["2023-12-31", "31/12/2023", "12/31/2023"]
        for date_str in valid_dates:
            assert english_engine._validate_date(date_str)

        invalid_dates = ["", "not-a-date", "2023-13-45", "31/13/2023"]
        for date_str in invalid_dates:
            assert not english_engine._validate_date(date_str)

    @pytest.mark.asyncio
    async def test_performance_benchmark(self, english_engine):
        """A single extraction finishes within ten seconds.

        When the result carries a ``'processing_time'`` entry, it must agree
        with the externally measured duration to within one second.
        """
        import time

        test_image = self.create_test_image("Performance Test Text", 'eng')

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = await english_engine.extract(test_image)
        processing_time = time.perf_counter() - start_time

        assert processing_time < 10.0  # 10 seconds max

        if 'processing_time' in result:
            assert abs(result['processing_time'] - processing_time) < 1.0
if __name__ == "__main__":
    # Run this file's tests verbosely with short tracebacks, and propagate
    # pytest's exit code so a failing run makes the script exit non-zero.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))