""" اختبارات التكامل (Integration Tests) ======================================== اختبار تفاعل الوحدات المختلفة مع بعضها. يشمل اختبارات الوحدات القديمة + اختبارات المكونات الجديدة: - Surya OCR - التطبيع (normalize) - كشف الجداول (table detection) - معالجة اللغات المختلطة - التصدير من JSON القياسي """ import json import os import pytest import tempfile from unittest.mock import MagicMock, patch class TestOCRToNLPIntegration: """اختبار تسلسل OCR -> NLP.""" def test_ocr_engine_initialization(self): """اختبار تهيئة محرك OCR.""" from modules.vision.ocr_engine import OCREngine engine = OCREngine( enable_trocr=False, enable_easyocr=False, enable_tesseract=False, enable_surya=False, enable_paddleocr=False, ) assert engine is not None assert len(engine.get_available_engines()) == 0 def test_spell_corrector_initialization(self): """اختبار تهيئة المصحح الإملائي.""" from modules.nlp.spell_corrector import SpellCorrector corrector = SpellCorrector() assert corrector is not None assert "en" in corrector.supported_languages assert "ar" in corrector.supported_languages assert "de" in corrector.supported_languages def test_spell_corrector_protected_terms(self): """اختبار حماية المصطلحات التقنية.""" from modules.nlp.spell_corrector import SpellCorrector corrector = SpellCorrector() # كلمات بايثون محجوزة assert corrector.correct_word("print") == "print" assert corrector.correct_word("numpy") == "numpy" assert corrector.correct_word("async") == "async" # أرقام assert corrector.correct_word("123") == "123" # مقاطع كود assert corrector.correct_word("my_variable") == "my_variable" def test_spell_corrector_english(self): """اختبار التصحيح الإنجليزي.""" from modules.nlp.spell_corrector import SpellCorrector corrector = SpellCorrector() result = corrector.correct_text("helloo world") assert result["corrected_text"] is not None assert isinstance(result["total_corrections"], int) def test_spell_corrector_batch(self): """اختبار التصحيح المتوازي.""" from modules.nlp.spell_corrector import SpellCorrector corrector = SpellCorrector() texts = ["helloo world", "testt text", "samplee data"] results = corrector.correct_batch(texts) assert len(results) == 3 for result in results: assert "corrected_text" in result def test_ocr_engine_availability(self): """اختبار توفر المحركات.""" from modules.vision.ocr_engine import OCREngine engine = OCREngine() engines = engine.get_available_engines() assert isinstance(engines, list) for e in engines: assert "name" in e assert "available" in e assert "enabled" in e def test_ocr_engine_includes_surya(self): """اختبار تضمين Surya في قائمة المحركات.""" from modules.vision.ocr_engine import OCREngine engine = OCREngine() engines = engine.get_available_engines() engine_names = [e["name"] for e in engines] assert "Surya" in engine_names class TestModuleImports: """اختبار استيراد جميع الوحدات.""" def test_import_vision_modules(self): """اختبار استيراد وحدة الرؤية.""" from modules.vision import ocr_engine, image_preprocessor, text_reconstructor, pdf_processor assert ocr_engine is not None assert image_preprocessor is not None def test_import_nlp_modules(self): """اختبار استيراد وحدة NLP.""" from modules.nlp import spell_corrector, translator, summarizer, language_detector assert spell_corrector is not None def test_import_evaluation(self): """اختبار استيراد وحدة التقييم.""" from modules.evaluation import metrics assert metrics is not None def test_import_core_structure(self): """اختبار استيراد النماذج الأساسية.""" from modules.core.structure import BBox, OCRToken, DocumentPage, Document assert BBox is not None assert OCRToken is not None def test_import_export(self): """اختبار استيراد وحدة التصدير.""" from modules.export import exporter assert exporter is not None def test_import_security(self): """اختبار استيراد وحدة الأمان.""" from modules.security import secure_file_handler, sensitive_data_scanner assert secure_file_handler is not None def test_import_rtl(self): """اختبار استيراد معالجة RTL.""" from modules.nlp import arabic_rtl assert arabic_rtl is not None def test_import_new_normalize(self): """اختبار استيراد وحدة التطبيع الجديدة.""" from modules.vision.normalize import normalize_ocr_output assert normalize_ocr_output is not None def test_import_new_surya(self): """اختبار استيراد محرك Surya — يتوقع ImportError إذا لم يُثبّت.""" try: from modules.vision.surya_ocr import SuryaOCREngine assert SuryaOCREngine is not None except ImportError: pass # Surya غير مثبّت — سلوك متوقع def test_import_new_table_detection(self): """اختبار استيراد كاشف الجداول.""" from modules.vision.table_detection import TableDetectionTransformer assert TableDetectionTransformer is not None def test_import_new_mixed_language(self): """اختبار استيراد معالج اللغات المختلطة.""" from modules.nlp.mixed_language import MixedLanguageHandler assert MixedLanguageHandler is not None class TestConfigIntegration: """اختبار تكامل الإعدادات.""" def test_config_defaults(self): """اختبار الإعدادات الافتراضية.""" from config import OmniFileConfig cfg = OmniFileConfig() assert cfg.enable_trocr is True assert cfg.enable_easyocr is True assert cfg.enable_tesseract is True assert "en" in cfg.supported_languages assert "ar" in cfg.supported_languages assert "de" in cfg.supported_languages def test_config_save_load(self, tmp_path): """اختبار حفظ وتحميل الإعدادات.""" from config import OmniFileConfig cfg = OmniFileConfig(enable_paddleocr=True, fusion_strategy="voting") config_path = str(tmp_path / "test_config.json") cfg.save(config_path) loaded = OmniFileConfig.load(config_path) assert loaded.enable_paddleocr is True assert loaded.fusion_strategy == "voting" class TestResultFusion: """اختبار دمج النتائج.""" def test_fusion_empty_results(self): """اختبار دمج نتائج فارغة.""" from modules.vision.result_fusion import ResultFusion fusion = ResultFusion() result = fusion.fuse_page_results([]) assert result is not None def test_fusion_single_result(self): """اختبار دمج نتيجة واحدة.""" from modules.vision.result_fusion import ResultFusion, LineResult, BoundingBox, PageResult fusion = ResultFusion() line = LineResult( text="test text", confidence=0.9, bbox=BoundingBox(x=0, y=0, width=100, height=30), words=[], block_type="paragraph", source_engine="easyocr" ) page = PageResult(lines=[line]) result = fusion.fuse_page_results([page]) assert result is not None class TestMetricsIntegration: """اختبار تكامل مقاييس الأداء.""" def test_cer_perfect_match(self): """اختبار CER مع تطابق مثالي.""" from modules.evaluation.metrics import calculate_cer cer = calculate_cer("hello world", "hello world") assert cer == 0.0 def test_wer_perfect_match(self): """اختبار WER مع تطابق مثالي.""" from modules.evaluation.metrics import calculate_wer wer = calculate_wer("hello world", "hello world") assert wer == 0.0 def test_arabic_normalization(self): """اختبار تطبيع النص العربي.""" from modules.evaluation.metrics import _normalize_arabic # إزالة التشكيل normalized = _normalize_arabic("بِسْمِ اللهِ الرَّحْمٰنِ الرَّحِيمِ") assert "بسم" in normalized # توحيد الألف normalized = _normalize_arabic("أحمد إبراهيم") assert "ا" in normalized class TestNormalizeOCR: """اختبار وحدة التطبيع الجديدة.""" def test_normalize_basic(self): """اختبار التطبيع الأساسي.""" from modules.vision.normalize import normalize_ocr_output raw_blocks = [ { "type": "paragraph", "bbox": [0.1, 0.2, 0.9, 0.3], "text": "مرحبا بالعالم", "confidence": 0.95, }, { "type": "paragraph", "bbox": [0.1, 0.4, 0.9, 0.5], "text": "Hello World", "confidence": 0.90, }, ] result = normalize_ocr_output( raw_blocks, "test.jpg", 2480, 3508, "tesseract", ["ar", "en"] ) assert "metadata" in result assert "pages" in result assert len(result["pages"]) == 1 assert len(result["pages"][0]["blocks"]) == 2 assert result["metadata"]["engine"] == "tesseract" assert result["pages"][0]["width"] == 2480 assert result["pages"][0]["blocks"][0]["id"] == "block_1" def test_normalize_table(self): """اختبار تطبيع كتل الجداول.""" from modules.vision.normalize import normalize_ocr_output raw_blocks = [ { "type": "table", "bbox": [0.1, 0.1, 0.9, 0.5], "confidence": 0.85, "cells": [ ["اسم", "العمر"], ["أحمد", "25"], ["سارة", "30"], ], } ] result = normalize_ocr_output( raw_blocks, "test.jpg", 2480, 3508, "surya", ["ar"] ) block = result["pages"][0]["blocks"][0] assert block["type"] == "table" assert "structure" in block assert block["structure"]["rows"] == 3 assert block["structure"]["cols"] == 2 assert len(block["structure"]["cells"]) == 6 def test_normalize_image_with_caption(self): """اختبار تطبيع صور مع تسمية.""" from modules.vision.normalize import normalize_ocr_output raw_blocks = [ { "type": "image", "bbox": [0.1, 0.1, 0.9, 0.5], "image_file": "figure1.png", "caption": { "text": "شكل 1: مخطط النظام", "bbox": [0.2, 0.52, 0.8, 0.56], }, } ] result = normalize_ocr_output( raw_blocks, "test.jpg", 2480, 3508, "easyocr", ["ar"] ) block = result["pages"][0]["blocks"][0] assert block["type"] == "image" assert block["image_file"] == "figure1.png" assert "caption" in block assert block["caption"]["text"] == "شكل 1: مخطط النظام" def test_normalize_save_and_load(self, tmp_path): """اختبار حفظ وتحميل JSON الموحد.""" from modules.vision.normalize import ( normalize_ocr_output, save_normalized, load_normalized, ) raw_blocks = [ {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test", "confidence": 0.9} ] result = normalize_ocr_output( raw_blocks, "test.jpg", 100, 100, "tesseract", ["en"] ) json_path = str(tmp_path / "result.json") save_normalized(result, json_path) loaded = load_normalized(json_path) assert loaded["metadata"]["engine"] == "tesseract" assert len(loaded["pages"]) == 1 def test_merge_pages(self): """اختبار دمج نتائج متعددة.""" from modules.vision.normalize import normalize_ocr_output, merge_pages blocks1 = [{"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 1", "confidence": 0.9}] blocks2 = [{"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 2", "confidence": 0.9}] result1 = normalize_ocr_output(blocks1, "p1.jpg", 100, 100, "tesseract", ["ar"]) result2 = normalize_ocr_output(blocks2, "p2.jpg", 100, 100, "tesseract", ["ar"]) merged = merge_pages([result1, result2]) assert merged["metadata"]["page_count"] == 2 assert merged["pages"][0]["page_index"] == 0 assert merged["pages"][1]["page_index"] == 1 class TestMixedLanguageHandler: """اختبار معالج اللغات المختلطة.""" def test_detect_language_arabic(self): """اختبار كشف اللغة العربية.""" from modules.nlp.mixed_language import MixedLanguageHandler handler = MixedLanguageHandler() assert handler.detect_language("مرحبا بالعالم") == "ar" def test_detect_language_english(self): """اختبار كشف اللغة الإنجليزية.""" from modules.nlp.mixed_language import MixedLanguageHandler handler = MixedLanguageHandler() assert handler.detect_language("Hello World") == "en" def test_detect_language_empty(self): """اختبار كشف لغة نص فارغ.""" from modules.nlp.mixed_language import MixedLanguageHandler handler = MixedLanguageHandler() assert handler.detect_language("") == "ar" def test_split_by_language(self): """اختبار تقسيم النص حسب اللغة.""" from modules.nlp.mixed_language import MixedLanguageHandler handler = MixedLanguageHandler() segments = handler.split_by_language("مرحبا Hello") assert len(segments) >= 2 # التحقق من وجود لغتين مختلفتين langs = [s[0] for s in segments] assert "ar" in langs assert "en" in langs def test_correct_arabic(self): """اختبار التصحيح العربي.""" from modules.nlp.mixed_language import MixedLanguageHandler handler = MixedLanguageHandler() # الحياة موجودة في القاموس result = handler.correct_text_mixed("الحياه") assert "الحياة" in result def test_get_ocr_language_params(self): """اختبار استخراج معلمات اللغات.""" from modules.nlp.mixed_language import MixedLanguageHandler handler = MixedLanguageHandler() langs = handler.get_ocr_language_params("مرحبا Hello world") assert "ar" in langs assert "en" in langs class TestLayoutExport: """اختبار التصدير المطابق للتنسيق.""" def test_export_to_docx_basic(self, tmp_path): """اختبار تصدير DOCX أساسي.""" from modules.export.layout_preserving import export_to_docx layout_data = { "blocks": [ {"type": "paragraph", "text": "مرحبا بالعالم", "bbox": [0, 0, 1, 1]}, {"type": "header", "text": "عنوان", "bbox": [0, 0, 1, 1]}, ] } output_path = str(tmp_path / "test.docx") result = export_to_docx(layout_data, output_path) assert os.path.exists(result) def test_layout_to_docx_from_json(self, tmp_path): """اختبار التصدير من JSON القياسي.""" from modules.export.layout_preserving import layout_to_docx from modules.vision.normalize import normalize_ocr_output, save_normalized raw_blocks = [ {"type": "paragraph", "bbox": [0.1, 0.1, 0.9, 0.2], "text": "مرحبا", "confidence": 0.95}, {"type": "header", "bbox": [0.1, 0.0, 0.9, 0.1], "text": "عنوان", "confidence": 0.9}, ] normalized = normalize_ocr_output( raw_blocks, "test.jpg", 2480, 3508, "tesseract", ["ar"] ) json_path = str(tmp_path / "result.json") save_normalized(normalized, json_path) docx_path = str(tmp_path / "output.docx") result = layout_to_docx(json_path, docx_path) assert os.path.exists(result) def test_layout_to_docx_with_table(self, tmp_path): """اختبار التصدير مع جدول.""" from modules.export.layout_preserving import layout_to_docx from modules.vision.normalize import normalize_ocr_output, save_normalized raw_blocks = [ { "type": "table", "bbox": [0.1, 0.1, 0.9, 0.5], "confidence": 0.85, "cells": [ ["اسم", "القيمة"], ["أ", "100"], ["ب", "200"], ], } ] normalized = normalize_ocr_output( raw_blocks, "test.jpg", 2480, 3508, "surya", ["ar"] ) json_path = str(tmp_path / "table_result.json") save_normalized(normalized, json_path) docx_path = str(tmp_path / "table_output.docx") result = layout_to_docx(json_path, docx_path) assert os.path.exists(result) def test_ocr_result_to_layout(self): """اختبار تحويل نتيجة OCR إلى layout.""" from modules.export.layout_preserving import ocr_result_to_layout ocr_json = { "blocks": [ {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test"}, {"type": "table", "bbox": [0, 0, 1, 1], "cells": [["a", "b"]]}, ] } layout = ocr_result_to_layout(ocr_json, "img.jpg") assert "blocks" in layout assert len(layout["blocks"]) == 2 class TestTableDetection: """اختبار كاشف الجداول.""" def test_table_detection_init(self): """اختبار تهيئة كاشف الجداول.""" from modules.vision.table_detection import TableDetectionTransformer detector = TableDetectionTransformer(device="cpu") assert detector is not None assert detector.device == "cpu" def test_table_detection_without_model(self): """اختبار كشف الجداول بدون تحميل نموذج (يُرجع قائمة فارغة).""" from modules.vision.table_detection import TableDetectionTransformer detector = TableDetectionTransformer(device="cpu") # بدون تحميل النموذج فعلياً، detect_tables ستسجل تحذيراً وترجع [] tables = detector.detect_tables("nonexistent.jpg", threshold=0.5) assert isinstance(tables, list)