Spaces:
Sleeping
Sleeping
| """ | |
| اختبارات التكامل (Integration Tests) | |
| ======================================== | |
| اختبار تفاعل الوحدات المختلفة مع بعضها. | |
| يشمل اختبارات الوحدات القديمة + اختبارات المكونات الجديدة: | |
| - Surya OCR | |
| - التطبيع (normalize) | |
| - كشف الجداول (table detection) | |
| - معالجة اللغات المختلطة | |
| - التصدير من JSON القياسي | |
| """ | |
| import json | |
| import os | |
| import pytest | |
| import tempfile | |
| from unittest.mock import MagicMock, patch | |
| class TestOCRToNLPIntegration: | |
| """اختبار تسلسل OCR -> NLP.""" | |
| def test_ocr_engine_initialization(self): | |
| """اختبار تهيئة محرك OCR.""" | |
| from modules.vision.ocr_engine import OCREngine | |
| engine = OCREngine( | |
| enable_trocr=False, | |
| enable_easyocr=False, | |
| enable_tesseract=False, | |
| enable_surya=False, | |
| enable_paddleocr=False, | |
| ) | |
| assert engine is not None | |
| assert len(engine.get_available_engines()) == 0 | |
| def test_spell_corrector_initialization(self): | |
| """اختبار تهيئة المصحح الإملائي.""" | |
| from modules.nlp.spell_corrector import SpellCorrector | |
| corrector = SpellCorrector() | |
| assert corrector is not None | |
| assert "en" in corrector.supported_languages | |
| assert "ar" in corrector.supported_languages | |
| assert "de" in corrector.supported_languages | |
| def test_spell_corrector_protected_terms(self): | |
| """اختبار حماية المصطلحات التقنية.""" | |
| from modules.nlp.spell_corrector import SpellCorrector | |
| corrector = SpellCorrector() | |
| # كلمات بايثون محجوزة | |
| assert corrector.correct_word("print") == "print" | |
| assert corrector.correct_word("numpy") == "numpy" | |
| assert corrector.correct_word("async") == "async" | |
| # أرقام | |
| assert corrector.correct_word("123") == "123" | |
| # مقاطع كود | |
| assert corrector.correct_word("my_variable") == "my_variable" | |
| def test_spell_corrector_english(self): | |
| """اختبار التصحيح الإنجليزي.""" | |
| from modules.nlp.spell_corrector import SpellCorrector | |
| corrector = SpellCorrector() | |
| result = corrector.correct_text("helloo world") | |
| assert result["corrected_text"] is not None | |
| assert isinstance(result["total_corrections"], int) | |
| def test_spell_corrector_batch(self): | |
| """اختبار التصحيح المتوازي.""" | |
| from modules.nlp.spell_corrector import SpellCorrector | |
| corrector = SpellCorrector() | |
| texts = ["helloo world", "testt text", "samplee data"] | |
| results = corrector.correct_batch(texts) | |
| assert len(results) == 3 | |
| for result in results: | |
| assert "corrected_text" in result | |
| def test_ocr_engine_availability(self): | |
| """اختبار توفر المحركات.""" | |
| from modules.vision.ocr_engine import OCREngine | |
| engine = OCREngine() | |
| engines = engine.get_available_engines() | |
| assert isinstance(engines, list) | |
| for e in engines: | |
| assert "name" in e | |
| assert "available" in e | |
| assert "enabled" in e | |
| def test_ocr_engine_includes_surya(self): | |
| """اختبار تضمين Surya في قائمة المحركات.""" | |
| from modules.vision.ocr_engine import OCREngine | |
| engine = OCREngine() | |
| engines = engine.get_available_engines() | |
| engine_names = [e["name"] for e in engines] | |
| assert "Surya" in engine_names | |
| class TestModuleImports: | |
| """اختبار استيراد جميع الوحدات.""" | |
| def test_import_vision_modules(self): | |
| """اختبار استيراد وحدة الرؤية.""" | |
| from modules.vision import ocr_engine, image_preprocessor, text_reconstructor, pdf_processor | |
| assert ocr_engine is not None | |
| assert image_preprocessor is not None | |
| def test_import_nlp_modules(self): | |
| """اختبار استيراد وحدة NLP.""" | |
| from modules.nlp import spell_corrector, translator, summarizer, language_detector | |
| assert spell_corrector is not None | |
| def test_import_evaluation(self): | |
| """اختبار استيراد وحدة التقييم.""" | |
| from modules.evaluation import metrics | |
| assert metrics is not None | |
| def test_import_core_structure(self): | |
| """اختبار استيراد النماذج الأساسية.""" | |
| from modules.core.structure import BBox, OCRToken, DocumentPage, Document | |
| assert BBox is not None | |
| assert OCRToken is not None | |
| def test_import_export(self): | |
| """اختبار استيراد وحدة التصدير.""" | |
| from modules.export import exporter | |
| assert exporter is not None | |
| def test_import_security(self): | |
| """اختبار استيراد وحدة الأمان.""" | |
| from modules.security import secure_file_handler, sensitive_data_scanner | |
| assert secure_file_handler is not None | |
| def test_import_rtl(self): | |
| """اختبار استيراد معالجة RTL.""" | |
| from modules.nlp import arabic_rtl | |
| assert arabic_rtl is not None | |
| def test_import_new_normalize(self): | |
| """اختبار استيراد وحدة التطبيع الجديدة.""" | |
| from modules.vision.normalize import normalize_ocr_output | |
| assert normalize_ocr_output is not None | |
| def test_import_new_surya(self): | |
| """اختبار استيراد محرك Surya — يتوقع ImportError إذا لم يُثبّت.""" | |
| try: | |
| from modules.vision.surya_ocr import SuryaOCREngine | |
| assert SuryaOCREngine is not None | |
| except ImportError: | |
| pass # Surya غير مثبّت — سلوك متوقع | |
| def test_import_new_table_detection(self): | |
| """اختبار استيراد كاشف الجداول.""" | |
| from modules.vision.table_detection import TableDetectionTransformer | |
| assert TableDetectionTransformer is not None | |
| def test_import_new_mixed_language(self): | |
| """اختبار استيراد معالج اللغات المختلطة.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| assert MixedLanguageHandler is not None | |
| class TestConfigIntegration: | |
| """اختبار تكامل الإعدادات.""" | |
| def test_config_defaults(self): | |
| """اختبار الإعدادات الافتراضية.""" | |
| from config import OmniFileConfig | |
| cfg = OmniFileConfig() | |
| assert cfg.enable_trocr is True | |
| assert cfg.enable_easyocr is True | |
| assert cfg.enable_tesseract is True | |
| assert "en" in cfg.supported_languages | |
| assert "ar" in cfg.supported_languages | |
| assert "de" in cfg.supported_languages | |
| def test_config_save_load(self, tmp_path): | |
| """اختبار حفظ وتحميل الإعدادات.""" | |
| from config import OmniFileConfig | |
| cfg = OmniFileConfig(enable_paddleocr=True, fusion_strategy="voting") | |
| config_path = str(tmp_path / "test_config.json") | |
| cfg.save(config_path) | |
| loaded = OmniFileConfig.load(config_path) | |
| assert loaded.enable_paddleocr is True | |
| assert loaded.fusion_strategy == "voting" | |
| class TestResultFusion: | |
| """اختبار دمج النتائج.""" | |
| def test_fusion_empty_results(self): | |
| """اختبار دمج نتائج فارغة.""" | |
| from modules.vision.result_fusion import ResultFusion | |
| fusion = ResultFusion() | |
| result = fusion.fuse_page_results([]) | |
| assert result is not None | |
| def test_fusion_single_result(self): | |
| """اختبار دمج نتيجة واحدة.""" | |
| from modules.vision.result_fusion import ResultFusion, LineResult, BoundingBox, PageResult | |
| fusion = ResultFusion() | |
| line = LineResult( | |
| text="test text", | |
| confidence=0.9, | |
| bbox=BoundingBox(x=0, y=0, width=100, height=30), | |
| words=[], | |
| block_type="paragraph", | |
| source_engine="easyocr" | |
| ) | |
| page = PageResult(lines=[line]) | |
| result = fusion.fuse_page_results([page]) | |
| assert result is not None | |
| class TestMetricsIntegration: | |
| """اختبار تكامل مقاييس الأداء.""" | |
| def test_cer_perfect_match(self): | |
| """اختبار CER مع تطابق مثالي.""" | |
| from modules.evaluation.metrics import calculate_cer | |
| cer = calculate_cer("hello world", "hello world") | |
| assert cer == 0.0 | |
| def test_wer_perfect_match(self): | |
| """اختبار WER مع تطابق مثالي.""" | |
| from modules.evaluation.metrics import calculate_wer | |
| wer = calculate_wer("hello world", "hello world") | |
| assert wer == 0.0 | |
| def test_arabic_normalization(self): | |
| """اختبار تطبيع النص العربي.""" | |
| from modules.evaluation.metrics import _normalize_arabic | |
| # إزالة التشكيل | |
| normalized = _normalize_arabic("بِسْمِ اللهِ الرَّحْمٰنِ الرَّحِيمِ") | |
| assert "بسم" in normalized | |
| # توحيد الألف | |
| normalized = _normalize_arabic("أحمد إبراهيم") | |
| assert "ا" in normalized | |
| class TestNormalizeOCR: | |
| """اختبار وحدة التطبيع الجديدة.""" | |
| def test_normalize_basic(self): | |
| """اختبار التطبيع الأساسي.""" | |
| from modules.vision.normalize import normalize_ocr_output | |
| raw_blocks = [ | |
| { | |
| "type": "paragraph", | |
| "bbox": [0.1, 0.2, 0.9, 0.3], | |
| "text": "مرحبا بالعالم", | |
| "confidence": 0.95, | |
| }, | |
| { | |
| "type": "paragraph", | |
| "bbox": [0.1, 0.4, 0.9, 0.5], | |
| "text": "Hello World", | |
| "confidence": 0.90, | |
| }, | |
| ] | |
| result = normalize_ocr_output( | |
| raw_blocks, "test.jpg", 2480, 3508, "tesseract", ["ar", "en"] | |
| ) | |
| assert "metadata" in result | |
| assert "pages" in result | |
| assert len(result["pages"]) == 1 | |
| assert len(result["pages"][0]["blocks"]) == 2 | |
| assert result["metadata"]["engine"] == "tesseract" | |
| assert result["pages"][0]["width"] == 2480 | |
| assert result["pages"][0]["blocks"][0]["id"] == "block_1" | |
| def test_normalize_table(self): | |
| """اختبار تطبيع كتل الجداول.""" | |
| from modules.vision.normalize import normalize_ocr_output | |
| raw_blocks = [ | |
| { | |
| "type": "table", | |
| "bbox": [0.1, 0.1, 0.9, 0.5], | |
| "confidence": 0.85, | |
| "cells": [ | |
| ["اسم", "العمر"], | |
| ["أحمد", "25"], | |
| ["سارة", "30"], | |
| ], | |
| } | |
| ] | |
| result = normalize_ocr_output( | |
| raw_blocks, "test.jpg", 2480, 3508, "surya", ["ar"] | |
| ) | |
| block = result["pages"][0]["blocks"][0] | |
| assert block["type"] == "table" | |
| assert "structure" in block | |
| assert block["structure"]["rows"] == 3 | |
| assert block["structure"]["cols"] == 2 | |
| assert len(block["structure"]["cells"]) == 6 | |
| def test_normalize_image_with_caption(self): | |
| """اختبار تطبيع صور مع تسمية.""" | |
| from modules.vision.normalize import normalize_ocr_output | |
| raw_blocks = [ | |
| { | |
| "type": "image", | |
| "bbox": [0.1, 0.1, 0.9, 0.5], | |
| "image_file": "figure1.png", | |
| "caption": { | |
| "text": "شكل 1: مخطط النظام", | |
| "bbox": [0.2, 0.52, 0.8, 0.56], | |
| }, | |
| } | |
| ] | |
| result = normalize_ocr_output( | |
| raw_blocks, "test.jpg", 2480, 3508, "easyocr", ["ar"] | |
| ) | |
| block = result["pages"][0]["blocks"][0] | |
| assert block["type"] == "image" | |
| assert block["image_file"] == "figure1.png" | |
| assert "caption" in block | |
| assert block["caption"]["text"] == "شكل 1: مخطط النظام" | |
| def test_normalize_save_and_load(self, tmp_path): | |
| """اختبار حفظ وتحميل JSON الموحد.""" | |
| from modules.vision.normalize import ( | |
| normalize_ocr_output, | |
| save_normalized, | |
| load_normalized, | |
| ) | |
| raw_blocks = [ | |
| {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test", "confidence": 0.9} | |
| ] | |
| result = normalize_ocr_output( | |
| raw_blocks, "test.jpg", 100, 100, "tesseract", ["en"] | |
| ) | |
| json_path = str(tmp_path / "result.json") | |
| save_normalized(result, json_path) | |
| loaded = load_normalized(json_path) | |
| assert loaded["metadata"]["engine"] == "tesseract" | |
| assert len(loaded["pages"]) == 1 | |
| def test_merge_pages(self): | |
| """اختبار دمج نتائج متعددة.""" | |
| from modules.vision.normalize import normalize_ocr_output, merge_pages | |
| blocks1 = [{"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 1", "confidence": 0.9}] | |
| blocks2 = [{"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 2", "confidence": 0.9}] | |
| result1 = normalize_ocr_output(blocks1, "p1.jpg", 100, 100, "tesseract", ["ar"]) | |
| result2 = normalize_ocr_output(blocks2, "p2.jpg", 100, 100, "tesseract", ["ar"]) | |
| merged = merge_pages([result1, result2]) | |
| assert merged["metadata"]["page_count"] == 2 | |
| assert merged["pages"][0]["page_index"] == 0 | |
| assert merged["pages"][1]["page_index"] == 1 | |
| class TestMixedLanguageHandler: | |
| """اختبار معالج اللغات المختلطة.""" | |
| def test_detect_language_arabic(self): | |
| """اختبار كشف اللغة العربية.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| handler = MixedLanguageHandler() | |
| assert handler.detect_language("مرحبا بالعالم") == "ar" | |
| def test_detect_language_english(self): | |
| """اختبار كشف اللغة الإنجليزية.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| handler = MixedLanguageHandler() | |
| assert handler.detect_language("Hello World") == "en" | |
| def test_detect_language_empty(self): | |
| """اختبار كشف لغة نص فارغ.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| handler = MixedLanguageHandler() | |
| assert handler.detect_language("") == "ar" | |
| def test_split_by_language(self): | |
| """اختبار تقسيم النص حسب اللغة.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| handler = MixedLanguageHandler() | |
| segments = handler.split_by_language("مرحبا Hello") | |
| assert len(segments) >= 2 | |
| # التحقق من وجود لغتين مختلفتين | |
| langs = [s[0] for s in segments] | |
| assert "ar" in langs | |
| assert "en" in langs | |
| def test_correct_arabic(self): | |
| """اختبار التصحيح العربي.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| handler = MixedLanguageHandler() | |
| # الحياة موجودة في القاموس | |
| result = handler.correct_text_mixed("الحياه") | |
| assert "الحياة" in result | |
| def test_get_ocr_language_params(self): | |
| """اختبار استخراج معلمات اللغات.""" | |
| from modules.nlp.mixed_language import MixedLanguageHandler | |
| handler = MixedLanguageHandler() | |
| langs = handler.get_ocr_language_params("مرحبا Hello world") | |
| assert "ar" in langs | |
| assert "en" in langs | |
| class TestLayoutExport: | |
| """اختبار التصدير المطابق للتنسيق.""" | |
| def test_export_to_docx_basic(self, tmp_path): | |
| """اختبار تصدير DOCX أساسي.""" | |
| from modules.export.layout_preserving import export_to_docx | |
| layout_data = { | |
| "blocks": [ | |
| {"type": "paragraph", "text": "مرحبا بالعالم", "bbox": [0, 0, 1, 1]}, | |
| {"type": "header", "text": "عنوان", "bbox": [0, 0, 1, 1]}, | |
| ] | |
| } | |
| output_path = str(tmp_path / "test.docx") | |
| result = export_to_docx(layout_data, output_path) | |
| assert os.path.exists(result) | |
| def test_layout_to_docx_from_json(self, tmp_path): | |
| """اختبار التصدير من JSON القياسي.""" | |
| from modules.export.layout_preserving import layout_to_docx | |
| from modules.vision.normalize import normalize_ocr_output, save_normalized | |
| raw_blocks = [ | |
| {"type": "paragraph", "bbox": [0.1, 0.1, 0.9, 0.2], "text": "مرحبا", "confidence": 0.95}, | |
| {"type": "header", "bbox": [0.1, 0.0, 0.9, 0.1], "text": "عنوان", "confidence": 0.9}, | |
| ] | |
| normalized = normalize_ocr_output( | |
| raw_blocks, "test.jpg", 2480, 3508, "tesseract", ["ar"] | |
| ) | |
| json_path = str(tmp_path / "result.json") | |
| save_normalized(normalized, json_path) | |
| docx_path = str(tmp_path / "output.docx") | |
| result = layout_to_docx(json_path, docx_path) | |
| assert os.path.exists(result) | |
| def test_layout_to_docx_with_table(self, tmp_path): | |
| """اختبار التصدير مع جدول.""" | |
| from modules.export.layout_preserving import layout_to_docx | |
| from modules.vision.normalize import normalize_ocr_output, save_normalized | |
| raw_blocks = [ | |
| { | |
| "type": "table", | |
| "bbox": [0.1, 0.1, 0.9, 0.5], | |
| "confidence": 0.85, | |
| "cells": [ | |
| ["اسم", "القيمة"], | |
| ["أ", "100"], | |
| ["ب", "200"], | |
| ], | |
| } | |
| ] | |
| normalized = normalize_ocr_output( | |
| raw_blocks, "test.jpg", 2480, 3508, "surya", ["ar"] | |
| ) | |
| json_path = str(tmp_path / "table_result.json") | |
| save_normalized(normalized, json_path) | |
| docx_path = str(tmp_path / "table_output.docx") | |
| result = layout_to_docx(json_path, docx_path) | |
| assert os.path.exists(result) | |
| def test_ocr_result_to_layout(self): | |
| """اختبار تحويل نتيجة OCR إلى layout.""" | |
| from modules.export.layout_preserving import ocr_result_to_layout | |
| ocr_json = { | |
| "blocks": [ | |
| {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test"}, | |
| {"type": "table", "bbox": [0, 0, 1, 1], "cells": [["a", "b"]]}, | |
| ] | |
| } | |
| layout = ocr_result_to_layout(ocr_json, "img.jpg") | |
| assert "blocks" in layout | |
| assert len(layout["blocks"]) == 2 | |
| class TestTableDetection: | |
| """اختبار كاشف الجداول.""" | |
| def test_table_detection_init(self): | |
| """اختبار تهيئة كاشف الجداول.""" | |
| from modules.vision.table_detection import TableDetectionTransformer | |
| detector = TableDetectionTransformer(device="cpu") | |
| assert detector is not None | |
| assert detector.device == "cpu" | |
| def test_table_detection_without_model(self): | |
| """اختبار كشف الجداول بدون تحميل نموذج (يُرجع قائمة فارغة).""" | |
| from modules.vision.table_detection import TableDetectionTransformer | |
| detector = TableDetectionTransformer(device="cpu") | |
| # بدون تحميل النموذج فعلياً، detect_tables ستسجل تحذيراً وترجع [] | |
| tables = detector.detect_tables("nonexistent.jpg", threshold=0.5) | |
| assert isinstance(tables, list) | |