Spaces:

DrAbdulmalek
/

OmniFile-Processor

Sleeping

OmniFile-Processor / tests /test_integration.py

Dr. Abdulmalek

deploy: OmniFile AI Processor v4.3.0

900df0b 20 days ago

20 kB

	"""
	اختبارات التكامل (Integration Tests)
	========================================
	اختبار تفاعل الوحدات المختلفة مع بعضها.
	يشمل اختبارات الوحدات القديمة + اختبارات المكونات الجديدة:
	- Surya OCR
	- التطبيع (normalize)
	- كشف الجداول (table detection)
	- معالجة اللغات المختلطة
	- التصدير من JSON القياسي
	"""

	import json
	import os
	import pytest
	import tempfile
	from unittest.mock import MagicMock, patch


	class TestOCRToNLPIntegration:
	    """Integration checks for the OCR -> NLP chain."""

	    def test_ocr_engine_initialization(self):
	        """Build an OCREngine with every backend switched off."""
	        from modules.vision.ocr_engine import OCREngine

	        all_disabled = {
	            "enable_trocr": False,
	            "enable_easyocr": False,
	            "enable_tesseract": False,
	            "enable_surya": False,
	            "enable_paddleocr": False,
	        }
	        ocr = OCREngine(**all_disabled)
	        assert ocr is not None
	        # NOTE(review): sibling tests show each listed engine carries an
	        # "enabled" flag; confirm disabled engines are really omitted from
	        # this listing rather than reported with enabled=False.
	        assert len(ocr.get_available_engines()) == 0

	    def test_spell_corrector_initialization(self):
	        """A default SpellCorrector advertises English, Arabic and German."""
	        from modules.nlp.spell_corrector import SpellCorrector

	        speller = SpellCorrector()
	        assert speller is not None
	        for lang_code in ("en", "ar", "de"):
	            assert lang_code in speller.supported_languages

	    def test_spell_corrector_protected_terms(self):
	        """Technical terms must pass through correct_word unchanged."""
	        from modules.nlp.spell_corrector import SpellCorrector

	        speller = SpellCorrector()

	        # Python-related words, then a number, then a code-style identifier.
	        protected_terms = ("print", "numpy", "async", "123", "my_variable")
	        for term in protected_terms:
	            assert speller.correct_word(term) == term

	    def test_spell_corrector_english(self):
	        """correct_text returns a result dict with text and a correction count."""
	        from modules.nlp.spell_corrector import SpellCorrector

	        speller = SpellCorrector()
	        outcome = speller.correct_text("helloo world")

	        assert outcome["corrected_text"] is not None
	        assert isinstance(outcome["total_corrections"], int)

	    def test_spell_corrector_batch(self):
	        """correct_batch yields one result dict per input text."""
	        from modules.nlp.spell_corrector import SpellCorrector

	        speller = SpellCorrector()
	        samples = ["helloo world", "testt text", "samplee data"]
	        outcomes = speller.correct_batch(samples)

	        assert len(outcomes) == 3
	        for outcome in outcomes:
	            assert "corrected_text" in outcome

	    def test_ocr_engine_availability(self):
	        """Each entry of get_available_engines exposes the expected keys."""
	        from modules.vision.ocr_engine import OCREngine

	        listing = OCREngine().get_available_engines()
	        assert isinstance(listing, list)

	        expected_keys = ("name", "available", "enabled")
	        for entry in listing:
	            for key in expected_keys:
	                assert key in entry

	    def test_ocr_engine_includes_surya(self):
	        """The default engine listing contains an entry named "Surya"."""
	        from modules.vision.ocr_engine import OCREngine

	        listing = OCREngine().get_available_engines()
	        listed_names = [entry["name"] for entry in listing]
	        assert "Surya" in listed_names


	class TestModuleImports:
	    """Smoke tests: every project package must be importable."""

	    def test_import_vision_modules(self):
	        """Import the vision sub-modules."""
	        from modules.vision import ocr_engine, image_preprocessor, text_reconstructor, pdf_processor

	        # The import itself is the real check; the assertions just keep every
	        # imported name referenced.
	        for module in (ocr_engine, image_preprocessor, text_reconstructor, pdf_processor):
	            assert module is not None

	    def test_import_nlp_modules(self):
	        """Import the NLP sub-modules."""
	        from modules.nlp import spell_corrector, translator, summarizer, language_detector

	        for module in (spell_corrector, translator, summarizer, language_detector):
	            assert module is not None

	    def test_import_evaluation(self):
	        """Import the evaluation metrics module."""
	        from modules.evaluation import metrics

	        assert metrics is not None

	    def test_import_core_structure(self):
	        """Import the core document model classes."""
	        from modules.core.structure import BBox, OCRToken, DocumentPage, Document

	        for model in (BBox, OCRToken, DocumentPage, Document):
	            assert model is not None

	    def test_import_export(self):
	        """Import the export module."""
	        from modules.export import exporter

	        assert exporter is not None

	    def test_import_security(self):
	        """Import the security modules."""
	        from modules.security import secure_file_handler, sensitive_data_scanner

	        for module in (secure_file_handler, sensitive_data_scanner):
	            assert module is not None

	    def test_import_rtl(self):
	        """Import the Arabic RTL handling module."""
	        from modules.nlp import arabic_rtl

	        assert arabic_rtl is not None

	    def test_import_new_normalize(self):
	        """Import the normalization entry point."""
	        from modules.vision.normalize import normalize_ocr_output

	        assert normalize_ocr_output is not None

	    def test_import_new_surya(self):
	        """Import the Surya engine wrapper; skip when Surya is not installed."""
	        try:
	            from modules.vision.surya_ocr import SuryaOCREngine
	        except ImportError as exc:
	            # Surya is optional. Report a skip so a missing dependency is
	            # visible in the test summary instead of counting as a pass.
	            pytest.skip(f"Surya OCR is not installed: {exc}")
	        assert SuryaOCREngine is not None

	    def test_import_new_table_detection(self):
	        """Import the table detector class."""
	        from modules.vision.table_detection import TableDetectionTransformer

	        assert TableDetectionTransformer is not None

	    def test_import_new_mixed_language(self):
	        """Import the mixed-language handler class."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        assert MixedLanguageHandler is not None


	class TestConfigIntegration:
	    """Configuration object integration checks."""

	    def test_config_defaults(self):
	        """A default OmniFileConfig enables the three core engines and languages."""
	        from config import OmniFileConfig

	        defaults = OmniFileConfig()

	        assert defaults.enable_trocr is True
	        assert defaults.enable_easyocr is True
	        assert defaults.enable_tesseract is True
	        for lang_code in ("en", "ar", "de"):
	            assert lang_code in defaults.supported_languages

	    def test_config_save_load(self, tmp_path):
	        """Values written by save() come back unchanged from load()."""
	        from config import OmniFileConfig

	        original = OmniFileConfig(enable_paddleocr=True, fusion_strategy="voting")

	        target = str(tmp_path / "test_config.json")
	        original.save(target)

	        restored = OmniFileConfig.load(target)
	        assert restored.enable_paddleocr is True
	        assert restored.fusion_strategy == "voting"


	class TestResultFusion:
	    """Result fusion checks."""

	    def test_fusion_empty_results(self):
	        """Fusing an empty list of page results still returns something."""
	        from modules.vision.result_fusion import ResultFusion

	        merger = ResultFusion()
	        fused = merger.fuse_page_results([])
	        assert fused is not None

	    def test_fusion_single_result(self):
	        """Fusing a single one-line page returns a non-None result."""
	        from modules.vision.result_fusion import ResultFusion, LineResult, BoundingBox, PageResult

	        merger = ResultFusion()

	        only_line = LineResult(
	            text="test text",
	            confidence=0.9,
	            bbox=BoundingBox(x=0, y=0, width=100, height=30),
	            words=[],
	            block_type="paragraph",
	            source_engine="easyocr",
	        )
	        only_page = PageResult(lines=[only_line])

	        fused = merger.fuse_page_results([only_page])
	        assert fused is not None


	class TestMetricsIntegration:
	    """Evaluation metric integration checks."""

	    def test_cer_perfect_match(self):
	        """CER of two identical strings is exactly zero."""
	        from modules.evaluation.metrics import calculate_cer

	        cer = calculate_cer("hello world", "hello world")
	        assert cer == 0.0

	    def test_wer_perfect_match(self):
	        """WER of two identical strings is exactly zero."""
	        from modules.evaluation.metrics import calculate_wer

	        wer = calculate_wer("hello world", "hello world")
	        assert wer == 0.0

	    def test_arabic_normalization(self):
	        """Arabic normalization strips diacritics and unifies alef forms."""
	        from modules.evaluation.metrics import _normalize_arabic

	        # Diacritic removal: the bare word must appear once marks are stripped.
	        normalized = _normalize_arabic("بِسْمِ اللهِ الرَّحْمٰنِ الرَّحِيمِ")
	        assert "بسم" in normalized

	        # Alef unification.
	        normalized = _normalize_arabic("أحمد إبراهيم")
	        assert "ا" in normalized
	        # The input already contains a bare alef (inside the second word), so
	        # the membership check above cannot fail on its own. Also require that
	        # the hamza-carrying alef forms have been replaced.
	        assert "أ" not in normalized
	        assert "إ" not in normalized


	class TestNormalizeOCR:
	    """Checks for the OCR output normalization module."""

	    def test_normalize_basic(self):
	        """Two paragraph blocks normalize into one page with two blocks."""
	        from modules.vision.normalize import normalize_ocr_output

	        arabic_paragraph = {
	            "type": "paragraph",
	            "bbox": [0.1, 0.2, 0.9, 0.3],
	            "text": "مرحبا بالعالم",
	            "confidence": 0.95,
	        }
	        english_paragraph = {
	            "type": "paragraph",
	            "bbox": [0.1, 0.4, 0.9, 0.5],
	            "text": "Hello World",
	            "confidence": 0.90,
	        }

	        document = normalize_ocr_output(
	            [arabic_paragraph, english_paragraph],
	            "test.jpg",
	            2480,
	            3508,
	            "tesseract",
	            ["ar", "en"],
	        )

	        assert "metadata" in document
	        assert "pages" in document
	        assert len(document["pages"]) == 1

	        first_page = document["pages"][0]
	        assert len(first_page["blocks"]) == 2
	        assert document["metadata"]["engine"] == "tesseract"
	        assert first_page["width"] == 2480
	        assert first_page["blocks"][0]["id"] == "block_1"

	    def test_normalize_table(self):
	        """A 3x2 table block gains a structure section with six cells."""
	        from modules.vision.normalize import normalize_ocr_output

	        table_block = {
	            "type": "table",
	            "bbox": [0.1, 0.1, 0.9, 0.5],
	            "confidence": 0.85,
	            "cells": [
	                ["اسم", "العمر"],
	                ["أحمد", "25"],
	                ["سارة", "30"],
	            ],
	        }

	        document = normalize_ocr_output(
	            [table_block], "test.jpg", 2480, 3508, "surya", ["ar"]
	        )

	        normalized_table = document["pages"][0]["blocks"][0]
	        assert normalized_table["type"] == "table"
	        assert "structure" in normalized_table

	        structure = normalized_table["structure"]
	        assert structure["rows"] == 3
	        assert structure["cols"] == 2
	        assert len(structure["cells"]) == 6

	    def test_normalize_image_with_caption(self):
	        """An image block keeps its file name and caption text."""
	        from modules.vision.normalize import normalize_ocr_output

	        figure_block = {
	            "type": "image",
	            "bbox": [0.1, 0.1, 0.9, 0.5],
	            "image_file": "figure1.png",
	            "caption": {
	                "text": "شكل 1: مخطط النظام",
	                "bbox": [0.2, 0.52, 0.8, 0.56],
	            },
	        }

	        document = normalize_ocr_output(
	            [figure_block], "test.jpg", 2480, 3508, "easyocr", ["ar"]
	        )

	        normalized_figure = document["pages"][0]["blocks"][0]
	        assert normalized_figure["type"] == "image"
	        assert normalized_figure["image_file"] == "figure1.png"
	        assert "caption" in normalized_figure
	        assert normalized_figure["caption"]["text"] == "شكل 1: مخطط النظام"

	    def test_normalize_save_and_load(self, tmp_path):
	        """A normalized document survives a save/load round trip."""
	        from modules.vision.normalize import (
	            normalize_ocr_output,
	            save_normalized,
	            load_normalized,
	        )

	        single_block = {
	            "type": "paragraph",
	            "bbox": [0, 0, 1, 1],
	            "text": "test",
	            "confidence": 0.9,
	        }
	        document = normalize_ocr_output(
	            [single_block], "test.jpg", 100, 100, "tesseract", ["en"]
	        )

	        target = str(tmp_path / "result.json")
	        save_normalized(document, target)

	        reloaded = load_normalized(target)
	        assert reloaded["metadata"]["engine"] == "tesseract"
	        assert len(reloaded["pages"]) == 1

	    def test_merge_pages(self):
	        """merge_pages combines two documents and numbers their pages 0 and 1."""
	        from modules.vision.normalize import normalize_ocr_output, merge_pages

	        first_blocks = [
	            {"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 1", "confidence": 0.9}
	        ]
	        second_blocks = [
	            {"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 2", "confidence": 0.9}
	        ]

	        first_doc = normalize_ocr_output(first_blocks, "p1.jpg", 100, 100, "tesseract", ["ar"])
	        second_doc = normalize_ocr_output(second_blocks, "p2.jpg", 100, 100, "tesseract", ["ar"])

	        combined = merge_pages([first_doc, second_doc])
	        assert combined["metadata"]["page_count"] == 2
	        assert combined["pages"][0]["page_index"] == 0
	        assert combined["pages"][1]["page_index"] == 1


	class TestMixedLanguageHandler:
	    """Checks for the mixed-language handler."""

	    def test_detect_language_arabic(self):
	        """Arabic-only text is detected as "ar"."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        detected = MixedLanguageHandler().detect_language("مرحبا بالعالم")
	        assert detected == "ar"

	    def test_detect_language_english(self):
	        """English-only text is detected as "en"."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        detected = MixedLanguageHandler().detect_language("Hello World")
	        assert detected == "en"

	    def test_detect_language_empty(self):
	        """An empty string is reported as "ar"."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        detected = MixedLanguageHandler().detect_language("")
	        assert detected == "ar"

	    def test_split_by_language(self):
	        """Mixed text splits into segments covering both languages."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        mixed = MixedLanguageHandler()
	        pieces = mixed.split_by_language("مرحبا Hello")
	        assert len(pieces) >= 2

	        # The first item of each segment is its language code.
	        seen_languages = [piece[0] for piece in pieces]
	        for lang_code in ("ar", "en"):
	            assert lang_code in seen_languages

	    def test_correct_arabic(self):
	        """A misspelled Arabic word is corrected to its dictionary form."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        mixed = MixedLanguageHandler()
	        # The expected corrected spelling is a dictionary entry.
	        corrected = mixed.correct_text_mixed("الحياه")
	        assert "الحياة" in corrected

	    def test_get_ocr_language_params(self):
	        """Language parameters for mixed text include both "ar" and "en"."""
	        from modules.nlp.mixed_language import MixedLanguageHandler

	        mixed = MixedLanguageHandler()
	        language_params = mixed.get_ocr_language_params("مرحبا Hello world")
	        for lang_code in ("ar", "en"):
	            assert lang_code in language_params

	class TestLayoutExport:
	    """Checks for layout-preserving export."""

	    def test_export_to_docx_basic(self, tmp_path):
	        """export_to_docx writes a file for a paragraph plus a header."""
	        from modules.export.layout_preserving import export_to_docx

	        paragraph = {"type": "paragraph", "text": "مرحبا بالعالم", "bbox": [0, 0, 1, 1]}
	        header = {"type": "header", "text": "عنوان", "bbox": [0, 0, 1, 1]}
	        layout = {"blocks": [paragraph, header]}

	        destination = str(tmp_path / "test.docx")
	        written = export_to_docx(layout, destination)
	        assert os.path.exists(written)

	    def test_layout_to_docx_from_json(self, tmp_path):
	        """layout_to_docx converts a saved normalized JSON file into a DOCX."""
	        from modules.export.layout_preserving import layout_to_docx
	        from modules.vision.normalize import normalize_ocr_output, save_normalized

	        source_blocks = [
	            {"type": "paragraph", "bbox": [0.1, 0.1, 0.9, 0.2], "text": "مرحبا", "confidence": 0.95},
	            {"type": "header", "bbox": [0.1, 0.0, 0.9, 0.1], "text": "عنوان", "confidence": 0.9},
	        ]
	        document = normalize_ocr_output(
	            source_blocks, "test.jpg", 2480, 3508, "tesseract", ["ar"]
	        )
	        json_file = str(tmp_path / "result.json")
	        save_normalized(document, json_file)

	        docx_file = str(tmp_path / "output.docx")
	        written = layout_to_docx(json_file, docx_file)
	        assert os.path.exists(written)

	    def test_layout_to_docx_with_table(self, tmp_path):
	        """layout_to_docx also handles a document holding a table block."""
	        from modules.export.layout_preserving import layout_to_docx
	        from modules.vision.normalize import normalize_ocr_output, save_normalized

	        table_block = {
	            "type": "table",
	            "bbox": [0.1, 0.1, 0.9, 0.5],
	            "confidence": 0.85,
	            "cells": [
	                ["اسم", "القيمة"],
	                ["أ", "100"],
	                ["ب", "200"],
	            ],
	        }
	        document = normalize_ocr_output(
	            [table_block], "test.jpg", 2480, 3508, "surya", ["ar"]
	        )
	        json_file = str(tmp_path / "table_result.json")
	        save_normalized(document, json_file)

	        docx_file = str(tmp_path / "table_output.docx")
	        written = layout_to_docx(json_file, docx_file)
	        assert os.path.exists(written)

	    def test_ocr_result_to_layout(self):
	        """ocr_result_to_layout keeps both input blocks in its output."""
	        from modules.export.layout_preserving import ocr_result_to_layout

	        paragraph = {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test"}
	        table = {"type": "table", "bbox": [0, 0, 1, 1], "cells": [["a", "b"]]}
	        ocr_payload = {"blocks": [paragraph, table]}

	        converted = ocr_result_to_layout(ocr_payload, "img.jpg")
	        assert "blocks" in converted
	        assert len(converted["blocks"]) == 2


	class TestTableDetection:
	    """Checks for the table detector."""

	    def test_table_detection_init(self):
	        """A detector built for the CPU records that device."""
	        from modules.vision.table_detection import TableDetectionTransformer

	        detector = TableDetectionTransformer(device="cpu")
	        assert detector is not None
	        assert detector.device == "cpu"

	    def test_table_detection_without_model(self):
	        """Detecting without a loaded model is expected to return an empty list."""
	        from modules.vision.table_detection import TableDetectionTransformer

	        detector = TableDetectionTransformer(device="cpu")
	        # Per the original author's note, detect_tables should log a warning
	        # and return [] when no model has actually been loaded.
	        tables = detector.detect_tables("nonexistent.jpg", threshold=0.5)
	        assert isinstance(tables, list)
	        # Pin the documented outcome, not just the return type.
	        assert tables == []