Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

File size: 24,119 Bytes

"""Tests Sprint 13 — Corrections structurelles : parallelisation, exceptions, statistiques.

Classes de tests
----------------
TestPyprojectCorrections     (4 tests)  — Part 1 : Beta, deps clarifiées
TestEngineExecutionMode      (5 tests)  — Part 2 : execution_mode sur les classes moteur
TestRunnerParallelParams     (5 tests)  — Part 3 : signature run_benchmark étendue
TestRunnerTimeout            (3 tests)  — Part 3 : timeout par document
TestRunnerPartialResults     (4 tests)  — Part 3 : sauvegarde / reprise partiels
TestRunnerSilentExceptions   (3 tests)  — Part 2 : warnings au lieu de pass silencieux
TestWilcoxonValidation       (7 tests)  — Part 4 : valeurs de référence connues
TestWilcoxonScipyIntegration (3 tests)  — Part 4 : cohérence scipy / natif
"""

from __future__ import annotations

import inspect
import json
import math
from pathlib import Path
from unittest.mock import patch

import pytest

ROOT = Path(__file__).parent.parent


# ===========================================================================
# Fixtures
# ===========================================================================

@pytest.fixture
def tmp_corpus(tmp_path):
    """Corpus minimal de 3 documents pour les tests runner."""
    from PIL import Image
    for i in range(3):
        img = Image.new("RGB", (100, 30), color="white")
        img.save(tmp_path / f"doc{i:02d}.png")
        (tmp_path / f"doc{i:02d}.gt.txt").write_text(f"texte vérité {i}", encoding="utf-8")
    return tmp_path


# ===========================================================================
# Part 1 — Corrections pyproject.toml
# ===========================================================================

class TestPyprojectCorrections:

    def _read_pyproject(self) -> str:
        return (ROOT / "pyproject.toml").read_text(encoding="utf-8")

    def test_classifier_is_beta(self):
        """Le classifier doit être 4 - Beta et non 5 - Production/Stable."""
        content = self._read_pyproject()
        assert "Development Status :: 4 - Beta" in content, (
            "pyproject.toml doit contenir 'Development Status :: 4 - Beta'"
        )
        assert "Production/Stable" not in content, (
            "pyproject.toml ne doit plus contenir 'Production/Stable'"
        )

    def test_fastapi_not_in_base_deps(self):
        """fastapi ne doit pas être dans les dépendances de base."""
        import re
        content = self._read_pyproject()
        # Extraire la section dependencies = [...] sous [project] (avant la 1re section suivante)
        m = re.search(r"^dependencies\s*=\s*\[(.*?)\]", content, re.DOTALL | re.MULTILINE)
        assert m, "Section dependencies introuvable dans pyproject.toml"
        base_deps = m.group(1)
        assert "fastapi" not in base_deps, (
            "fastapi ne doit pas être dans les dépendances de base — seulement dans [web]"
        )

    def test_httpx_not_in_base_deps(self):
        """httpx ne doit pas être dans les dépendances de base."""
        import re
        content = self._read_pyproject()
        m = re.search(r"^dependencies\s*=\s*\[(.*?)\]", content, re.DOTALL | re.MULTILINE)
        assert m
        base_deps = m.group(1)
        assert "httpx" not in base_deps, (
            "httpx ne doit pas être dans les dépendances de base — seulement dans [web]"
        )

    def test_web_extra_has_fastapi_httpx_multipart(self):
        """L'extra [web] doit contenir fastapi, httpx et python-multipart."""
        import tomllib
        with (ROOT / "pyproject.toml").open("rb") as fh:
            data = tomllib.load(fh)
        web_deps = " ".join(data["project"]["optional-dependencies"]["web"])
        assert "fastapi" in web_deps
        assert "httpx" in web_deps
        assert "python-multipart" in web_deps


# ===========================================================================
# Part 2 — execution_mode sur les classes moteur
# ===========================================================================

class TestEngineExecutionMode:

    def test_base_engine_default_mode_is_io(self):
        """BaseOCREngine doit avoir execution_mode = 'io' par défaut."""
        from picarones.engines.base import BaseOCREngine
        assert BaseOCREngine.execution_mode == "io"

    def test_tesseract_engine_mode_is_cpu(self):
        """TesseractEngine doit avoir execution_mode = 'cpu'."""
        from picarones.engines.tesseract import TesseractEngine
        assert TesseractEngine.execution_mode == "cpu"

    def test_pero_engine_mode_is_cpu(self):
        """PeroOCREngine doit avoir execution_mode = 'cpu'."""
        from picarones.engines.pero_ocr import PeroOCREngine
        assert PeroOCREngine.execution_mode == "cpu"

    def test_mistral_engine_default_mode_is_io(self):
        """MistralOCREngine doit hériter execution_mode = 'io'."""
        from picarones.engines.mistral_ocr import MistralOCREngine
        assert MistralOCREngine.execution_mode == "io"

    def test_google_vision_default_mode_is_io(self):
        """GoogleVisionEngine doit hériter execution_mode = 'io'."""
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine.execution_mode == "io"


# ===========================================================================
# Part 3 — Signature run_benchmark étendue
# ===========================================================================

class TestRunnerParallelParams:

    def test_max_workers_param_exists(self):
        """run_benchmark doit accepter max_workers."""
        from picarones.core.runner import run_benchmark
        sig = inspect.signature(run_benchmark)
        assert "max_workers" in sig.parameters

    def test_max_workers_default_is_4(self):
        """max_workers doit avoir 4 comme valeur par défaut."""
        from picarones.core.runner import run_benchmark
        sig = inspect.signature(run_benchmark)
        assert sig.parameters["max_workers"].default == 4

    def test_timeout_seconds_param_exists(self):
        """run_benchmark doit accepter timeout_seconds."""
        from picarones.core.runner import run_benchmark
        sig = inspect.signature(run_benchmark)
        assert "timeout_seconds" in sig.parameters

    def test_timeout_seconds_default_is_60(self):
        """timeout_seconds doit avoir 60.0 comme valeur par défaut."""
        from picarones.core.runner import run_benchmark
        sig = inspect.signature(run_benchmark)
        assert sig.parameters["timeout_seconds"].default == 60.0

    def test_partial_dir_param_exists(self):
        """run_benchmark doit accepter partial_dir (None par défaut)."""
        from picarones.core.runner import run_benchmark
        sig = inspect.signature(run_benchmark)
        assert "partial_dir" in sig.parameters
        assert sig.parameters["partial_dir"].default is None


# ===========================================================================
# Part 3 — Timeout par document
# ===========================================================================

class TestRunnerTimeout:

    def test_timeout_doc_result_has_error(self, tmp_corpus):
        """Un document ayant dépassé le timeout doit avoir engine_error contenant 'timeout'."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine
        import time

        class SlowEngine(BaseOCREngine):
            @property
            def name(self): return "slow_engine"
            def version(self): return "0.1"
            def _run_ocr(self, image_path):
                time.sleep(5)  # 5 secondes — dépasse le timeout de 1s
                return "jamais atteint"

        corpus = load_corpus_from_directory(str(tmp_corpus))
        result = run_benchmark(
            corpus, [SlowEngine()],
            show_progress=False,
            timeout_seconds=1.0,
            max_workers=1,
        )
        assert len(result.engine_reports) == 1
        report = result.engine_reports[0]
        assert len(report.document_results) == len(corpus)
        # Au moins un document doit être marqué timeout
        timeout_docs = [dr for dr in report.document_results if dr.engine_error and "timeout" in dr.engine_error]
        assert len(timeout_docs) > 0, "Aucun document marqué timeout — le timeout ne fonctionne pas"

    def test_timeout_doc_result_cer_is_one(self, tmp_corpus):
        """Un document timeout doit avoir CER = 1.0."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine
        import time

        class SlowEngine(BaseOCREngine):
            @property
            def name(self): return "slow"
            def version(self): return "0.1"
            def _run_ocr(self, image_path):
                time.sleep(5)
                return ""

        corpus = load_corpus_from_directory(str(tmp_corpus))
        result = run_benchmark(
            corpus, [SlowEngine()],
            show_progress=False,
            timeout_seconds=1.0,
            max_workers=1,
        )
        for dr in result.engine_reports[0].document_results:
            if dr.engine_error and "timeout" in dr.engine_error:
                assert dr.metrics.cer == 1.0

    def test_fast_docs_not_affected_by_timeout(self, tmp_corpus):
        """Des documents rapides ne doivent pas être touchés par un timeout généreux."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine

        class FastEngine(BaseOCREngine):
            @property
            def name(self): return "fast"
            def version(self): return "0.1"
            def _run_ocr(self, image_path): return "texte ocr"

        corpus = load_corpus_from_directory(str(tmp_corpus))
        result = run_benchmark(
            corpus, [FastEngine()],
            show_progress=False,
            timeout_seconds=30.0,
        )
        timeout_docs = [
            dr for dr in result.engine_reports[0].document_results
            if dr.engine_error and "timeout" in dr.engine_error
        ]
        assert len(timeout_docs) == 0, "Les documents rapides ne doivent pas être marqués timeout"


# ===========================================================================
# Part 3 — Résultats partiels (sauvegarde / reprise)
# ===========================================================================

class TestRunnerPartialResults:

    def test_partial_file_created_during_run(self, tmp_corpus, tmp_path):
        """_save_partial_line doit être appelée pour chaque document traité."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine
        import picarones.core.runner as runner_mod

        save_calls: list[str] = []
        original_save = runner_mod._save_partial_line

        def tracking_save(path, doc_result):
            save_calls.append(doc_result.doc_id)
            original_save(path, doc_result)

        class MockEngine(BaseOCREngine):
            @property
            def name(self): return "mock"
            def version(self): return "0.1"
            def _run_ocr(self, image_path): return "texte"

        corpus = load_corpus_from_directory(str(tmp_corpus))
        with patch.object(runner_mod, "_save_partial_line", side_effect=tracking_save):
            run_benchmark(
                corpus, [MockEngine()],
                show_progress=False,
                partial_dir=str(tmp_path),
            )
        assert len(save_calls) == len(corpus), (
            f"_save_partial_line appelée {len(save_calls)} fois, attendu {len(corpus)}"
        )

    def test_partial_file_deleted_after_success(self, tmp_corpus, tmp_path):
        """Le fichier .partial.json doit être supprimé après un benchmark réussi."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine

        class MockEngine(BaseOCREngine):
            @property
            def name(self): return "mock"
            def version(self): return "0.1"
            def _run_ocr(self, image_path): return "texte"

        corpus = load_corpus_from_directory(str(tmp_corpus))
        run_benchmark(
            corpus, [MockEngine()],
            show_progress=False,
            partial_dir=str(tmp_path),
        )
        partial_files = list(tmp_path.glob("*.partial.json"))
        assert len(partial_files) == 0, f"Fichier(s) partiel(s) non supprimé(s) : {partial_files}"

    def test_partial_load_skips_already_done_docs(self, tmp_corpus, tmp_path):
        """La reprise depuis un fichier partiel doit sauter les documents déjà traités."""
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import _load_partial, _partial_path

        corpus = load_corpus_from_directory(str(tmp_corpus))
        corpus_name = corpus.name
        engine_name = "mock_engine"

        # Créer un fichier partiel simulant 1 document déjà traité
        path = _partial_path(corpus_name, engine_name, tmp_path)
        doc = corpus.documents[0]
        partial_line = {
            "doc_id": doc.doc_id,
            "image_path": str(doc.image_path),
            "ground_truth": doc.ground_truth,
            "hypothesis": "déjà traité",
            "metrics": {"cer": 0.1, "cer_nfc": 0.1, "cer_caseless": 0.1,
                        "wer": 0.1, "wer_normalized": 0.1, "mer": 0.1, "wil": 0.1,
                        "reference_length": 10, "hypothesis_length": 10},
            "duration_seconds": 0.5,
        }
        path.write_text(json.dumps(partial_line) + "\n", encoding="utf-8")

        _, loaded = _load_partial(corpus_name, engine_name, tmp_path)
        assert len(loaded) == 1
        assert loaded[0].doc_id == doc.doc_id
        assert loaded[0].hypothesis == "déjà traité"

    def test_partial_load_returns_empty_for_missing_file(self, tmp_path):
        """Si aucun fichier partiel n'existe, la liste doit être vide."""
        from picarones.core.runner import _load_partial
        _, loaded = _load_partial("corpus_inexistant", "moteur_inexistant", tmp_path)
        assert loaded == []


# ===========================================================================
# Part 2 — Exceptions non silencieuses dans le runner
# ===========================================================================

class TestRunnerSilentExceptions:

    def test_confusion_failure_logs_warning(self, tmp_corpus, caplog):
        """Une erreur dans build_confusion_matrix doit être loguée, pas ignorée."""
        import logging
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine

        class MockEngine(BaseOCREngine):
            @property
            def name(self): return "mock"
            def version(self): return "0.1"
            def _run_ocr(self, image_path): return "texte ocr"

        corpus = load_corpus_from_directory(str(tmp_corpus))
        with patch(
            "picarones.core.runner._compute_document_result",
            wraps=__import__("picarones.core.runner", fromlist=["_compute_document_result"])._compute_document_result,
        ):
            with patch("picarones.core.confusion.build_confusion_matrix", side_effect=RuntimeError("crash test")):
                with caplog.at_level(logging.WARNING):
                    result = run_benchmark(corpus, [MockEngine()], show_progress=False)

        assert result is not None, "Le benchmark ne doit pas planter si la confusion matrix échoue"
        # La clé est que le benchmark se termine normalement
        assert len(result.engine_reports) == 1

    def test_progress_callback_failure_logs_warning(self, tmp_corpus, caplog):
        """Une exception dans le progress_callback doit être loguée, pas propagée."""
        import logging
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark
        from picarones.engines.base import BaseOCREngine

        class MockEngine(BaseOCREngine):
            @property
            def name(self): return "mock"
            def version(self): return "0.1"
            def _run_ocr(self, image_path): return "texte"

        def bad_callback(engine_name, doc_idx, doc_id):
            raise ValueError("callback crash")

        corpus = load_corpus_from_directory(str(tmp_corpus))
        with caplog.at_level(logging.WARNING):
            result = run_benchmark(
                corpus, [MockEngine()],
                show_progress=False,
                progress_callback=bad_callback,
            )
        assert result is not None
        assert any("progress_callback" in r.message for r in caplog.records), (
            "L'exception du callback doit être loguée en WARNING"
        )

    def test_aggregate_helpers_log_on_failure(self, caplog):
        """Les helpers _aggregate_* doivent logger en WARNING et retourner None."""
        import logging
        from picarones.core.runner import _aggregate_confusion

        # Créer un doc_result avec des données de confusion corrompues
        from picarones.core.results import DocumentResult
        from picarones.core.metrics import MetricsResult
        bad_dr = DocumentResult(
            doc_id="x", image_path="x.png", ground_truth="gt", hypothesis="hyp",
            metrics=MetricsResult(cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
                                   wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
                                   reference_length=2, hypothesis_length=2),
            duration_seconds=0.1,
            confusion_matrix={"invalid_key": "corrupt_data"},  # va planter ConfusionMatrix(**...)
        )
        with caplog.at_level(logging.WARNING):
            result = _aggregate_confusion([bad_dr])
        assert result is None
        assert any("aggregate_confusion" in r.message for r in caplog.records)


# ===========================================================================
# Part 4 — Validation du test de Wilcoxon contre valeurs de référence
# ===========================================================================

class TestWilcoxonValidation:

    def test_identical_sequences_not_significant(self):
        """Séquences identiques → pas de différence, p = 1.0, significant = False."""
        from picarones.core.statistics import wilcoxon_test
        a = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        r = wilcoxon_test(a, a)
        assert r["significant"] is False
        assert r["p_value"] == 1.0
        assert r["n_pairs"] == 0

    def test_all_positive_diffs_w_minus_is_zero(self):
        """Si toutes les différences a−b sont positives : W⁻ = 0, W⁺ = n(n+1)/2."""
        from picarones.core.statistics import wilcoxon_test
        n = 10
        a = [float(i) for i in range(1, n + 1)]
        b = [0.0] * n
        r = wilcoxon_test(a, b)
        expected_total = n * (n + 1) / 2.0
        assert math.isclose(r["W_minus"], 0.0, abs_tol=1e-9)
        assert math.isclose(r["W_plus"], expected_total, abs_tol=1e-9)

    def test_w_plus_w_minus_sum_invariant(self):
        """W⁺ + W⁻ doit toujours être égal à n(n+1)/2 (n = nombre de paires non nulles)."""
        from picarones.core.statistics import wilcoxon_test
        a = [0.10, 0.25, 0.05, 0.40, 0.30, 0.15, 0.20, 0.35, 0.08, 0.18]
        b = [0.12, 0.20, 0.08, 0.35, 0.28, 0.18, 0.15, 0.40, 0.10, 0.20]
        r = wilcoxon_test(a, b)
        n = r["n_pairs"]
        expected = n * (n + 1) / 2.0
        actual = r["W_plus"] + r["W_minus"]
        assert math.isclose(actual, expected, abs_tol=1e-6), (
            f"W⁺+W⁻ = {actual} ≠ n(n+1)/2 = {expected}"
        )

    def test_clearly_different_sequences_significant(self):
        """Deux séquences très différentes (n=15) doivent donner p < 0.05."""
        from picarones.core.statistics import wilcoxon_test
        a = [0.05] * 15          # moteur A très performant
        b = [0.60] * 15          # moteur B peu performant — toutes diff = −0.55
        # Diffs a−b = −0.55 pour tous → W⁺ = 0 → devrait être significatif
        r = wilcoxon_test(a, b)
        assert r["significant"] is True, f"p = {r['p_value']} — devrait être significatif"
        assert r["p_value"] < 0.05

    def test_large_n_normal_approximation_reasonable(self):
        """Pour n = 20, l'approximation normale doit donner une p-value dans [0, 1]."""
        from picarones.core.statistics import wilcoxon_test
        import random
        rng = random.Random(42)
        a = [rng.uniform(0.1, 0.5) for _ in range(20)]
        b = [x + rng.uniform(0.0, 0.1) for x in a]
        r = wilcoxon_test(a, b)
        assert 0.0 <= r["p_value"] <= 1.0
        assert r["n_pairs"] <= 20

    def test_small_n_returns_conservative_p(self):
        """Pour n < 10, la p-value doit être 0.04 (significatif) ou 0.20 (non sign.)."""
        from picarones.core.statistics import wilcoxon_test, _SCIPY_AVAILABLE
        if _SCIPY_AVAILABLE:
            pytest.skip("scipy disponible — la table exacte n'est pas utilisée")
        a = [0.1, 0.2, 0.3]
        b = [0.5, 0.6, 0.7]  # toutes diff = −0.4 → W = 0 → significatif
        r = wilcoxon_test(a, b)
        # Avec n=3, W=0 ≤ _W_CRITICAL[3]=0 → p=0.04
        assert r["p_value"] in (0.04, 0.20)

    def test_result_keys_complete(self):
        """Le dict retourné doit contenir toutes les clés documentées."""
        from picarones.core.statistics import wilcoxon_test
        r = wilcoxon_test([0.1, 0.3, 0.2, 0.4, 0.15, 0.35, 0.25, 0.5, 0.45, 0.05],
                          [0.2, 0.2, 0.3, 0.3, 0.25, 0.25, 0.35, 0.35, 0.40, 0.15])
        for key in ("statistic", "p_value", "significant", "interpretation", "n_pairs", "W_plus", "W_minus"):
            assert key in r, f"Clé manquante dans le résultat Wilcoxon : {key}"


# ===========================================================================
# Part 4 — Cohérence scipy / implémentation native
# ===========================================================================

class TestWilcoxonScipyIntegration:

    def test_scipy_available_flag_is_bool(self):
        """_SCIPY_AVAILABLE doit être un booléen."""
        from picarones.core.statistics import _SCIPY_AVAILABLE
        assert isinstance(_SCIPY_AVAILABLE, bool)

    def test_scipy_and_native_agree_on_significance(self):
        """Scipy et l'implémentation native doivent s'accorder sur la significativité."""
        from picarones.core.statistics import wilcoxon_test, _SCIPY_AVAILABLE
        if not _SCIPY_AVAILABLE:
            pytest.skip("scipy non disponible")

        # Cas avec différences claires et n suffisant pour que les deux méthodes convergent
        a = [0.05, 0.08, 0.06, 0.07, 0.04, 0.09, 0.05, 0.07, 0.06, 0.08,
             0.05, 0.07, 0.06, 0.08, 0.04]
        b = [0.30, 0.35, 0.28, 0.32, 0.31, 0.29, 0.34, 0.33, 0.30, 0.31,
             0.29, 0.32, 0.33, 0.30, 0.31]

        r = wilcoxon_test(a, b)
        # Avec scipy, résultat doit être significatif
        assert r["significant"] is True

    def test_scipy_p_value_in_valid_range(self):
        """La p-value fournie par scipy doit être dans [0, 1]."""
        from picarones.core.statistics import wilcoxon_test, _SCIPY_AVAILABLE
        if not _SCIPY_AVAILABLE:
            pytest.skip("scipy non disponible")

        a = [0.1 + i * 0.02 for i in range(12)]
        b = [0.1 + i * 0.01 for i in range(12)]
        r = wilcoxon_test(a, b)
        assert 0.0 <= r["p_value"] <= 1.0