File size: 20,011 Bytes
900df0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
"""
Integration Tests
=================
Exercise the interaction between the different modules.
Covers the tests for the older modules plus tests for the new components:
- Surya OCR
- Normalization (normalize)
- Table detection
- Mixed-language handling
- Export from the standard JSON
"""

import json
import os
import pytest
import tempfile
from unittest.mock import MagicMock, patch


class TestOCRToNLPIntegration:
    """Integration checks for the OCR -> NLP sequence."""

    def test_ocr_engine_initialization(self):
        """Build an OCREngine with every backend flag off and expect no engines."""
        from modules.vision.ocr_engine import OCREngine

        # Every backend switch is explicitly disabled.
        disabled_flags = {
            "enable_trocr": False,
            "enable_easyocr": False,
            "enable_tesseract": False,
            "enable_surya": False,
            "enable_paddleocr": False,
        }
        ocr = OCREngine(**disabled_flags)

        assert ocr is not None
        assert len(ocr.get_available_engines()) == 0

    def test_spell_corrector_initialization(self):
        """A default SpellCorrector lists English, Arabic and German as supported."""
        from modules.nlp.spell_corrector import SpellCorrector

        checker = SpellCorrector()

        assert checker is not None
        for lang_code in ("en", "ar", "de"):
            assert lang_code in checker.supported_languages

    def test_spell_corrector_protected_terms(self):
        """Technical terms must come back from correct_word unchanged."""
        from modules.nlp.spell_corrector import SpellCorrector

        checker = SpellCorrector()

        # In order: Python keywords/identifiers, a number, a code-like token.
        protected_terms = ("print", "numpy", "async", "123", "my_variable")
        for term in protected_terms:
            assert checker.correct_word(term) == term

    def test_spell_corrector_english(self):
        """correct_text on English input yields text plus an integer correction count."""
        from modules.nlp.spell_corrector import SpellCorrector

        outcome = SpellCorrector().correct_text("helloo world")

        assert outcome["corrected_text"] is not None
        assert isinstance(outcome["total_corrections"], int)

    def test_spell_corrector_batch(self):
        """correct_batch returns one result per input, each with corrected text."""
        from modules.nlp.spell_corrector import SpellCorrector

        checker = SpellCorrector()
        inputs = ["helloo world", "testt text", "samplee data"]

        batch = checker.correct_batch(inputs)

        assert len(batch) == 3
        for entry in batch:
            assert "corrected_text" in entry

    def test_ocr_engine_availability(self):
        """Each entry of get_available_engines carries name/available/enabled keys."""
        from modules.vision.ocr_engine import OCREngine

        listing = OCREngine().get_available_engines()

        assert isinstance(listing, list)
        for info in listing:
            for required_key in ("name", "available", "enabled"):
                assert required_key in info

    def test_ocr_engine_includes_surya(self):
        """The default engine listing must contain an entry named "Surya"."""
        from modules.vision.ocr_engine import OCREngine

        listing = OCREngine().get_available_engines()
        reported_names = [info["name"] for info in listing]

        assert "Surya" in reported_names


class TestModuleImports:
    """Smoke tests: each project module must be importable.

    For these tests the import statement itself is the real check; the
    ``is not None`` assertions only confirm the imported name is bound.
    """

    def test_import_vision_modules(self):
        """Import the vision modules."""
        from modules.vision import ocr_engine, image_preprocessor, text_reconstructor, pdf_processor
        assert ocr_engine is not None
        assert image_preprocessor is not None

    def test_import_nlp_modules(self):
        """Import the NLP modules."""
        from modules.nlp import spell_corrector, translator, summarizer, language_detector
        assert spell_corrector is not None

    def test_import_evaluation(self):
        """Import the evaluation metrics module."""
        from modules.evaluation import metrics
        assert metrics is not None

    def test_import_core_structure(self):
        """Import the core data models."""
        from modules.core.structure import BBox, OCRToken, DocumentPage, Document
        assert BBox is not None
        assert OCRToken is not None

    def test_import_export(self):
        """Import the export module."""
        from modules.export import exporter
        assert exporter is not None

    def test_import_security(self):
        """Import the security modules."""
        from modules.security import secure_file_handler, sensitive_data_scanner
        assert secure_file_handler is not None

    def test_import_rtl(self):
        """Import the RTL handling module."""
        from modules.nlp import arabic_rtl
        assert arabic_rtl is not None

    def test_import_new_normalize(self):
        """Import the new normalization entry point."""
        from modules.vision.normalize import normalize_ocr_output
        assert normalize_ocr_output is not None

    def test_import_new_surya(self):
        """Import the Surya engine wrapper; skip when it cannot be imported.

        An ImportError here is an accepted outcome (Surya may not be
        installed), but it is reported as a skip rather than a pass so the
        test report shows that nothing was actually verified.
        """
        try:
            from modules.vision.surya_ocr import SuryaOCREngine
        except ImportError as exc:
            # pytest.skip raises, so the assertion below only runs on success.
            pytest.skip(f"Surya OCR engine is not importable: {exc}")
        assert SuryaOCREngine is not None

    def test_import_new_table_detection(self):
        """Import the table detector."""
        from modules.vision.table_detection import TableDetectionTransformer
        assert TableDetectionTransformer is not None

    def test_import_new_mixed_language(self):
        """Import the mixed-language handler."""
        from modules.nlp.mixed_language import MixedLanguageHandler
        assert MixedLanguageHandler is not None


class TestConfigIntegration:
    """Integration checks for the configuration object."""

    def test_config_defaults(self):
        """A default OmniFileConfig enables three engines and lists en/ar/de."""
        from config import OmniFileConfig

        defaults = OmniFileConfig()

        for flag_name in ("enable_trocr", "enable_easyocr", "enable_tesseract"):
            assert getattr(defaults, flag_name) is True
        for lang_code in ("en", "ar", "de"):
            assert lang_code in defaults.supported_languages

    def test_config_save_load(self, tmp_path):
        """Values written by save() are read back unchanged by load()."""
        from config import OmniFileConfig

        target = str(tmp_path / "test_config.json")
        written = OmniFileConfig(enable_paddleocr=True, fusion_strategy="voting")
        written.save(target)

        restored = OmniFileConfig.load(target)

        assert restored.enable_paddleocr is True
        assert restored.fusion_strategy == "voting"


class TestResultFusion:
    """Checks for fusing OCR results."""

    def test_fusion_empty_results(self):
        """Fusing an empty list of page results still returns something."""
        from modules.vision.result_fusion import ResultFusion

        fused = ResultFusion().fuse_page_results([])

        assert fused is not None

    def test_fusion_single_result(self):
        """Fusing a single one-line page returns a non-None result."""
        from modules.vision.result_fusion import ResultFusion, LineResult, BoundingBox, PageResult

        merger = ResultFusion()

        # One recognised line attributed to the "easyocr" engine.
        line_box = BoundingBox(x=0, y=0, width=100, height=30)
        only_line = LineResult(
            text="test text",
            confidence=0.9,
            bbox=line_box,
            words=[],
            block_type="paragraph",
            source_engine="easyocr",
        )
        single_page = PageResult(lines=[only_line])

        fused = merger.fuse_page_results([single_page])

        assert fused is not None


class TestMetricsIntegration:
    """Integration checks for the evaluation metrics."""

    def test_cer_perfect_match(self):
        """CER of a string against itself is exactly 0.0."""
        from modules.evaluation.metrics import calculate_cer
        cer = calculate_cer("hello world", "hello world")
        assert cer == 0.0

    def test_wer_perfect_match(self):
        """WER of a string against itself is exactly 0.0."""
        from modules.evaluation.metrics import calculate_wer
        wer = calculate_wer("hello world", "hello world")
        assert wer == 0.0

    def test_arabic_normalization(self):
        """Arabic normalization strips diacritics and unifies alef variants."""
        from modules.evaluation.metrics import _normalize_arabic

        # Diacritics removal: the bare word only appears once marks are stripped.
        normalized = _normalize_arabic("بِسْمِ اللهِ الرَّحْمٰنِ الرَّحِيمِ")
        assert "بسم" in normalized

        # Alef unification.
        normalized = _normalize_arabic("أحمد إبراهيم")
        assert "ا" in normalized
        # The input already contains a bare alef (inside "إبراهيم"), so the
        # assertion above holds even if nothing was normalized. Also require
        # that the hamza-carrying alef forms are gone.
        assert "أ" not in normalized
        assert "إ" not in normalized


class TestNormalizeOCR:
    """Checks for the new normalization module."""

    def test_normalize_basic(self):
        """Two paragraph blocks normalize into one page with two blocks."""
        from modules.vision.normalize import normalize_ocr_output

        arabic_paragraph = {
            "type": "paragraph",
            "bbox": [0.1, 0.2, 0.9, 0.3],
            "text": "مرحبا بالعالم",
            "confidence": 0.95,
        }
        english_paragraph = {
            "type": "paragraph",
            "bbox": [0.1, 0.4, 0.9, 0.5],
            "text": "Hello World",
            "confidence": 0.90,
        }

        document = normalize_ocr_output(
            [arabic_paragraph, english_paragraph],
            "test.jpg",
            2480,
            3508,
            "tesseract",
            ["ar", "en"],
        )

        assert "metadata" in document
        assert "pages" in document
        pages = document["pages"]
        assert len(pages) == 1
        first_page = pages[0]
        assert len(first_page["blocks"]) == 2
        assert document["metadata"]["engine"] == "tesseract"
        assert first_page["width"] == 2480
        assert first_page["blocks"][0]["id"] == "block_1"

    def test_normalize_table(self):
        """A 3x2 table block gains a structure with rows, cols and six cells."""
        from modules.vision.normalize import normalize_ocr_output

        table_block = {
            "type": "table",
            "bbox": [0.1, 0.1, 0.9, 0.5],
            "confidence": 0.85,
            "cells": [
                ["اسم", "العمر"],
                ["أحمد", "25"],
                ["سارة", "30"],
            ],
        }

        document = normalize_ocr_output(
            [table_block], "test.jpg", 2480, 3508, "surya", ["ar"]
        )

        normalized_table = document["pages"][0]["blocks"][0]
        assert normalized_table["type"] == "table"
        assert "structure" in normalized_table
        structure = normalized_table["structure"]
        assert structure["rows"] == 3
        assert structure["cols"] == 2
        assert len(structure["cells"]) == 6

    def test_normalize_image_with_caption(self):
        """An image block keeps its file name and caption text after normalization."""
        from modules.vision.normalize import normalize_ocr_output

        caption = {
            "text": "شكل 1: مخطط النظام",
            "bbox": [0.2, 0.52, 0.8, 0.56],
        }
        image_block = {
            "type": "image",
            "bbox": [0.1, 0.1, 0.9, 0.5],
            "image_file": "figure1.png",
            "caption": caption,
        }

        document = normalize_ocr_output(
            [image_block], "test.jpg", 2480, 3508, "easyocr", ["ar"]
        )

        normalized_image = document["pages"][0]["blocks"][0]
        assert normalized_image["type"] == "image"
        assert normalized_image["image_file"] == "figure1.png"
        assert "caption" in normalized_image
        assert normalized_image["caption"]["text"] == "شكل 1: مخطط النظام"

    def test_normalize_save_and_load(self, tmp_path):
        """A normalized document survives a save_normalized/load_normalized round trip."""
        from modules.vision.normalize import (
            normalize_ocr_output,
            save_normalized,
            load_normalized,
        )

        paragraph = {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test", "confidence": 0.9}
        document = normalize_ocr_output(
            [paragraph], "test.jpg", 100, 100, "tesseract", ["en"]
        )

        storage_path = str(tmp_path / "result.json")
        save_normalized(document, storage_path)
        reloaded = load_normalized(storage_path)

        assert reloaded["metadata"]["engine"] == "tesseract"
        assert len(reloaded["pages"]) == 1

    def test_merge_pages(self):
        """merge_pages combines two single-page results and indexes them 0 and 1."""
        from modules.vision.normalize import normalize_ocr_output, merge_pages

        first_blocks = [{"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 1", "confidence": 0.9}]
        second_blocks = [{"type": "paragraph", "bbox": [0, 0, 1, 0.5], "text": "صفحة 2", "confidence": 0.9}]

        per_page_results = [
            normalize_ocr_output(first_blocks, "p1.jpg", 100, 100, "tesseract", ["ar"]),
            normalize_ocr_output(second_blocks, "p2.jpg", 100, 100, "tesseract", ["ar"]),
        ]

        merged = merge_pages(per_page_results)

        assert merged["metadata"]["page_count"] == 2
        merged_pages = merged["pages"]
        assert merged_pages[0]["page_index"] == 0
        assert merged_pages[1]["page_index"] == 1


class TestMixedLanguageHandler:
    """Checks for the mixed-language handler."""

    def test_detect_language_arabic(self):
        """Arabic text is detected as "ar"."""
        from modules.nlp.mixed_language import MixedLanguageHandler

        detected = MixedLanguageHandler().detect_language("مرحبا بالعالم")

        assert detected == "ar"

    def test_detect_language_english(self):
        """English text is detected as "en"."""
        from modules.nlp.mixed_language import MixedLanguageHandler

        detected = MixedLanguageHandler().detect_language("Hello World")

        assert detected == "en"

    def test_detect_language_empty(self):
        """An empty string is expected to be reported as "ar"."""
        from modules.nlp.mixed_language import MixedLanguageHandler

        detected = MixedLanguageHandler().detect_language("")

        assert detected == "ar"

    def test_split_by_language(self):
        """Mixed Arabic/English text splits into segments tagged with both languages."""
        from modules.nlp.mixed_language import MixedLanguageHandler

        mixed = MixedLanguageHandler()
        parts = mixed.split_by_language("مرحبا Hello")

        assert len(parts) >= 2
        # The first item of each segment is its language tag; both must occur.
        segment_languages = [part[0] for part in parts]
        assert "ar" in segment_languages
        assert "en" in segment_languages

    def test_correct_arabic(self):
        """correct_text_mixed fixes the misspelt Arabic word to "الحياة"."""
        from modules.nlp.mixed_language import MixedLanguageHandler

        mixed = MixedLanguageHandler()
        # Per the original note, "الحياة" is present in the dictionary.
        corrected = mixed.correct_text_mixed("الحياه")

        assert "الحياة" in corrected

    def test_get_ocr_language_params(self):
        """Language parameters derived from mixed text include both "ar" and "en"."""
        from modules.nlp.mixed_language import MixedLanguageHandler

        mixed = MixedLanguageHandler()
        language_params = mixed.get_ocr_language_params("مرحبا Hello world")

        assert "ar" in language_params
        assert "en" in language_params


class TestLayoutExport:
    """Checks for layout-preserving export."""

    def test_export_to_docx_basic(self, tmp_path):
        """export_to_docx returns a path that exists for a two-block layout."""
        from modules.export.layout_preserving import export_to_docx

        blocks = [
            {"type": "paragraph", "text": "مرحبا بالعالم", "bbox": [0, 0, 1, 1]},
            {"type": "header", "text": "عنوان", "bbox": [0, 0, 1, 1]},
        ]
        destination = str(tmp_path / "test.docx")

        written_path = export_to_docx({"blocks": blocks}, destination)

        assert os.path.exists(written_path)

    def test_layout_to_docx_from_json(self, tmp_path):
        """layout_to_docx converts a saved normalized JSON file into a DOCX on disk."""
        from modules.export.layout_preserving import layout_to_docx
        from modules.vision.normalize import normalize_ocr_output, save_normalized

        paragraph = {"type": "paragraph", "bbox": [0.1, 0.1, 0.9, 0.2], "text": "مرحبا", "confidence": 0.95}
        header = {"type": "header", "bbox": [0.1, 0.0, 0.9, 0.1], "text": "عنوان", "confidence": 0.9}
        document = normalize_ocr_output(
            [paragraph, header], "test.jpg", 2480, 3508, "tesseract", ["ar"]
        )

        source_json = str(tmp_path / "result.json")
        save_normalized(document, source_json)

        destination = str(tmp_path / "output.docx")
        written_path = layout_to_docx(source_json, destination)

        assert os.path.exists(written_path)

    def test_layout_to_docx_with_table(self, tmp_path):
        """layout_to_docx also handles a normalized document containing a table."""
        from modules.export.layout_preserving import layout_to_docx
        from modules.vision.normalize import normalize_ocr_output, save_normalized

        table_block = {
            "type": "table",
            "bbox": [0.1, 0.1, 0.9, 0.5],
            "confidence": 0.85,
            "cells": [
                ["اسم", "القيمة"],
                ["أ", "100"],
                ["ب", "200"],
            ],
        }
        document = normalize_ocr_output(
            [table_block], "test.jpg", 2480, 3508, "surya", ["ar"]
        )

        source_json = str(tmp_path / "table_result.json")
        save_normalized(document, source_json)

        destination = str(tmp_path / "table_output.docx")
        written_path = layout_to_docx(source_json, destination)

        assert os.path.exists(written_path)

    def test_ocr_result_to_layout(self):
        """ocr_result_to_layout keeps both input blocks under a "blocks" key."""
        from modules.export.layout_preserving import ocr_result_to_layout

        paragraph = {"type": "paragraph", "bbox": [0, 0, 1, 1], "text": "test"}
        table = {"type": "table", "bbox": [0, 0, 1, 1], "cells": [["a", "b"]]}

        converted = ocr_result_to_layout({"blocks": [paragraph, table]}, "img.jpg")

        assert "blocks" in converted
        assert len(converted["blocks"]) == 2


class TestTableDetection:
    """Checks for the table detector."""

    def test_table_detection_init(self):
        """A detector created for the CPU exposes device == "cpu"."""
        from modules.vision.table_detection import TableDetectionTransformer

        cpu_detector = TableDetectionTransformer(device="cpu")

        assert cpu_detector is not None
        assert cpu_detector.device == "cpu"

    def test_table_detection_without_model(self):
        """detect_tables on a missing image must still return a list."""
        from modules.vision.table_detection import TableDetectionTransformer

        cpu_detector = TableDetectionTransformer(device="cpu")

        # Per the original note: with no model actually loaded, detect_tables
        # is expected to log a warning and return [] (only the list type is
        # asserted here).
        found = cpu_detector.detect_tables("nonexistent.jpg", threshold=0.5)

        assert isinstance(found, list)