Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / tests /test_sprint13_parallelisation_stats.py

Claude

feat: corrections structurelles en 4 parties (Sprint 13)

b283975 unverified 26 days ago

24.2 kB

	"""Tests Sprint 13 — Corrections structurelles : parallelisation, exceptions, statistiques.

	Classes de tests
	----------------
	TestPyprojectCorrections (4 tests) — Part 1 : Beta, deps clarifiées
	TestEngineExecutionMode (5 tests) — Part 2 : execution_mode sur les classes moteur
	TestRunnerParallelParams (5 tests) — Part 3 : signature run_benchmark étendue
	TestRunnerTimeout (3 tests) — Part 3 : timeout par document
	TestRunnerPartialResults (4 tests) — Part 3 : sauvegarde / reprise partiels
	TestRunnerSilentExceptions (3 tests) — Part 2 : warnings au lieu de pass silencieux
	TestWilcoxonValidation (7 tests) — Part 4 : valeurs de référence connues
	TestWilcoxonScipyIntegration (3 tests) — Part 4 : cohérence scipy / natif
	"""

	from __future__ import annotations

	import inspect
	import json
	import math
	from pathlib import Path
	from unittest.mock import MagicMock, patch

	import pytest

	# Directory two levels above this file (the parent of the directory that
	# contains it); the tests below read ``pyproject.toml`` from here.
	ROOT = Path(__file__).parent.parent


	# ===========================================================================
	# Fixtures
	# ===========================================================================

	@pytest.fixture
	def tmp_corpus(tmp_path):
	    """Fill ``tmp_path`` with a minimal three-document corpus and return it.

	    Each document is a blank white 100x30 RGB PNG (``docNN.png``) paired
	    with a UTF-8 ground-truth text file (``docNN.gt.txt``).
	    """
	    from PIL import Image

	    for index in range(3):
	        stem = f"doc{index:02d}"
	        blank_page = Image.new("RGB", (100, 30), color="white")
	        blank_page.save(tmp_path / f"{stem}.png")
	        ground_truth_file = tmp_path / f"{stem}.gt.txt"
	        ground_truth_file.write_text(f"texte vérité {index}", encoding="utf-8")
	    return tmp_path


	# ===========================================================================
	# Part 1 — Corrections pyproject.toml
	# ===========================================================================

	class TestPyprojectCorrections:
	    """Checks on ``pyproject.toml``: Beta classifier and web-only dependencies."""

	    def _read_pyproject(self) -> str:
	        """Return the raw text of ``ROOT / "pyproject.toml"``."""
	        return (ROOT / "pyproject.toml").read_text(encoding="utf-8")

	    def _load_pyproject(self) -> dict:
	        """Return ``ROOT / "pyproject.toml"`` parsed as TOML."""
	        import tomllib
	        with (ROOT / "pyproject.toml").open("rb") as fh:
	            return tomllib.load(fh)

	    def _base_dependencies(self) -> str:
	        """Return the ``[project].dependencies`` entries joined into one string.

	        The list is taken from the parsed TOML rather than regex-matched up
	        to the first ``]``: a requirement carrying extras (``pkg[extra]``)
	        would otherwise cut the match short and hide the remaining entries.
	        """
	        project = self._load_pyproject().get("project", {})
	        assert "dependencies" in project, "Section dependencies introuvable dans pyproject.toml"
	        return " ".join(project["dependencies"])

	    def test_classifier_is_beta(self):
	        """The classifier must be '4 - Beta', not '5 - Production/Stable'."""
	        content = self._read_pyproject()
	        assert "Development Status :: 4 - Beta" in content, (
	            "pyproject.toml doit contenir 'Development Status :: 4 - Beta'"
	        )
	        assert "Production/Stable" not in content, (
	            "pyproject.toml ne doit plus contenir 'Production/Stable'"
	        )

	    def test_fastapi_not_in_base_deps(self):
	        """fastapi must not appear in the base dependencies."""
	        base_deps = self._base_dependencies()
	        assert "fastapi" not in base_deps, (
	            "fastapi ne doit pas être dans les dépendances de base — seulement dans [web]"
	        )

	    def test_httpx_not_in_base_deps(self):
	        """httpx must not appear in the base dependencies."""
	        base_deps = self._base_dependencies()
	        assert "httpx" not in base_deps, (
	            "httpx ne doit pas être dans les dépendances de base — seulement dans [web]"
	        )

	    def test_web_extra_has_fastapi_httpx_multipart(self):
	        """The [web] extra must list fastapi, httpx and python-multipart."""
	        data = self._load_pyproject()
	        web_deps = " ".join(data["project"]["optional-dependencies"]["web"])
	        assert "fastapi" in web_deps
	        assert "httpx" in web_deps
	        assert "python-multipart" in web_deps


	# ===========================================================================
	# Part 2 — execution_mode sur les classes moteur
	# ===========================================================================

	class TestEngineExecutionMode:
	    """Each engine class must expose the expected ``execution_mode`` value."""

	    def test_base_engine_default_mode_is_io(self):
	        """BaseOCREngine.execution_mode must equal 'io'."""
	        import picarones.engines.base as engine_module
	        declared_mode = engine_module.BaseOCREngine.execution_mode
	        assert declared_mode == "io"

	    def test_tesseract_engine_mode_is_cpu(self):
	        """TesseractEngine.execution_mode must equal 'cpu'."""
	        import picarones.engines.tesseract as engine_module
	        declared_mode = engine_module.TesseractEngine.execution_mode
	        assert declared_mode == "cpu"

	    def test_pero_engine_mode_is_cpu(self):
	        """PeroOCREngine.execution_mode must equal 'cpu'."""
	        import picarones.engines.pero_ocr as engine_module
	        declared_mode = engine_module.PeroOCREngine.execution_mode
	        assert declared_mode == "cpu"

	    def test_mistral_engine_default_mode_is_io(self):
	        """MistralOCREngine.execution_mode must equal 'io'."""
	        import picarones.engines.mistral_ocr as engine_module
	        declared_mode = engine_module.MistralOCREngine.execution_mode
	        assert declared_mode == "io"

	    def test_google_vision_default_mode_is_io(self):
	        """GoogleVisionEngine.execution_mode must equal 'io'."""
	        import picarones.engines.google_vision as engine_module
	        declared_mode = engine_module.GoogleVisionEngine.execution_mode
	        assert declared_mode == "io"


	# ===========================================================================
	# Part 3 — Signature run_benchmark étendue
	# ===========================================================================

	class TestRunnerParallelParams:
	    """Signature checks on the parameters accepted by ``run_benchmark``."""

	    @staticmethod
	    def _run_benchmark_parameters():
	        """Return the parameter mapping of ``run_benchmark``'s signature."""
	        from picarones.core.runner import run_benchmark
	        return inspect.signature(run_benchmark).parameters

	    def test_max_workers_param_exists(self):
	        """run_benchmark must accept ``max_workers``."""
	        assert "max_workers" in self._run_benchmark_parameters()

	    def test_max_workers_default_is_4(self):
	        """``max_workers`` must default to 4."""
	        max_workers = self._run_benchmark_parameters()["max_workers"]
	        assert max_workers.default == 4

	    def test_timeout_seconds_param_exists(self):
	        """run_benchmark must accept ``timeout_seconds``."""
	        assert "timeout_seconds" in self._run_benchmark_parameters()

	    def test_timeout_seconds_default_is_60(self):
	        """``timeout_seconds`` must default to 60.0."""
	        timeout_seconds = self._run_benchmark_parameters()["timeout_seconds"]
	        assert timeout_seconds.default == 60.0

	    def test_partial_dir_param_exists(self):
	        """run_benchmark must accept ``partial_dir``, defaulting to None."""
	        parameters = self._run_benchmark_parameters()
	        assert "partial_dir" in parameters
	        assert parameters["partial_dir"].default is None


	# ===========================================================================
	# Part 3 — Timeout par document
	# ===========================================================================

	class TestRunnerTimeout:
	    """Per-document timeout handling in ``run_benchmark``."""

	    @staticmethod
	    def _timed_out(document_results):
	        """Return the results whose ``engine_error`` is set and contains 'timeout'."""
	        return [
	            dr for dr in document_results
	            if dr.engine_error and "timeout" in dr.engine_error
	        ]

	    @staticmethod
	    def _run_slow_benchmark(corpus_dir, engine_name, ocr_text):
	        """Benchmark the corpus with an engine that sleeps past the timeout.

	        The engine sleeps 5 s per document while ``run_benchmark`` is called
	        with ``timeout_seconds=1.0`` and ``max_workers=1``.

	        Args:
	            corpus_dir: directory holding the corpus to load.
	            engine_name: value returned by the engine's ``name`` property.
	            ocr_text: text the engine returns once its sleep is over.

	        Returns:
	            ``(corpus, result)``: the loaded corpus and the benchmark result.
	        """
	        import time
	        from picarones.core.corpus import load_corpus_from_directory
	        from picarones.core.runner import run_benchmark
	        from picarones.engines.base import BaseOCREngine

	        class SlowEngine(BaseOCREngine):
	            @property
	            def name(self):
	                return engine_name

	            def version(self):
	                return "0.1"

	            def _run_ocr(self, image_path):
	                time.sleep(5)  # 5 seconds, longer than the 1 s timeout
	                return ocr_text

	        corpus = load_corpus_from_directory(str(corpus_dir))
	        result = run_benchmark(
	            corpus, [SlowEngine()],
	            show_progress=False,
	            timeout_seconds=1.0,
	            max_workers=1,
	        )
	        return corpus, result

	    def test_timeout_doc_result_has_error(self, tmp_corpus):
	        """A document exceeding the timeout must have 'timeout' in engine_error."""
	        corpus, result = self._run_slow_benchmark(tmp_corpus, "slow_engine", "jamais atteint")
	        assert len(result.engine_reports) == 1
	        report = result.engine_reports[0]
	        assert len(report.document_results) == len(corpus)
	        # At least one document must be flagged as a timeout.
	        timeout_docs = self._timed_out(report.document_results)
	        assert len(timeout_docs) > 0, "Aucun document marqué timeout — le timeout ne fonctionne pas"

	    def test_timeout_doc_result_cer_is_one(self, tmp_corpus):
	        """A timed-out document must have CER = 1.0."""
	        _, result = self._run_slow_benchmark(tmp_corpus, "slow", "")
	        timeout_docs = self._timed_out(result.engine_reports[0].document_results)
	        # Without this guard the loop below would pass vacuously when no
	        # document was flagged, and the CER would never be checked.
	        assert timeout_docs, "Aucun document marqué timeout — le timeout ne fonctionne pas"
	        for dr in timeout_docs:
	            assert dr.metrics.cer == 1.0

	    def test_fast_docs_not_affected_by_timeout(self, tmp_corpus):
	        """Fast documents must not be flagged when the timeout is generous."""
	        from picarones.core.corpus import load_corpus_from_directory
	        from picarones.core.runner import run_benchmark
	        from picarones.engines.base import BaseOCREngine

	        class FastEngine(BaseOCREngine):
	            @property
	            def name(self):
	                return "fast"

	            def version(self):
	                return "0.1"

	            def _run_ocr(self, image_path):
	                return "texte ocr"

	        corpus = load_corpus_from_directory(str(tmp_corpus))
	        result = run_benchmark(
	            corpus, [FastEngine()],
	            show_progress=False,
	            timeout_seconds=30.0,
	        )
	        timeout_docs = self._timed_out(result.engine_reports[0].document_results)
	        assert len(timeout_docs) == 0, "Les documents rapides ne doivent pas être marqués timeout"


	# ===========================================================================
	# Part 3 — Résultats partiels (sauvegarde / reprise)
	# ===========================================================================

	class TestRunnerPartialResults:
	    """Saving and reloading of partial benchmark results."""

	    @staticmethod
	    def _build_mock_engine():
	        """Return an instance of a trivial engine that always outputs 'texte'."""
	        from picarones.engines.base import BaseOCREngine

	        class MockEngine(BaseOCREngine):
	            @property
	            def name(self):
	                return "mock"

	            def version(self):
	                return "0.1"

	            def _run_ocr(self, image_path):
	                return "texte"

	        return MockEngine()

	    def test_partial_file_created_during_run(self, tmp_corpus, tmp_path):
	        """``_save_partial_line`` must be called once per processed document."""
	        from picarones.core.corpus import load_corpus_from_directory
	        from picarones.core.runner import run_benchmark
	        import picarones.core.runner as runner_mod

	        save_calls: list[str] = []
	        real_save = runner_mod._save_partial_line

	        def recording_save(path, doc_result):
	            # Record the document id, then delegate to the real implementation.
	            save_calls.append(doc_result.doc_id)
	            real_save(path, doc_result)

	        corpus = load_corpus_from_directory(str(tmp_corpus))
	        with patch.object(runner_mod, "_save_partial_line", side_effect=recording_save):
	            run_benchmark(
	                corpus,
	                [self._build_mock_engine()],
	                show_progress=False,
	                partial_dir=str(tmp_path),
	            )
	        assert len(save_calls) == len(corpus), (
	            f"_save_partial_line appelée {len(save_calls)} fois, attendu {len(corpus)}"
	        )

	    def test_partial_file_deleted_after_success(self, tmp_corpus, tmp_path):
	        """No ``*.partial.json`` file may remain after a successful benchmark."""
	        from picarones.core.corpus import load_corpus_from_directory
	        from picarones.core.runner import run_benchmark

	        corpus = load_corpus_from_directory(str(tmp_corpus))
	        run_benchmark(
	            corpus,
	            [self._build_mock_engine()],
	            show_progress=False,
	            partial_dir=str(tmp_path),
	        )
	        partial_files = list(tmp_path.glob("*.partial.json"))
	        assert len(partial_files) == 0, f"Fichier(s) partiel(s) non supprimé(s) : {partial_files}"

	    def test_partial_load_skips_already_done_docs(self, tmp_corpus, tmp_path):
	        """``_load_partial`` must return the document stored in an existing partial file."""
	        from picarones.core.corpus import load_corpus_from_directory
	        # NOTE(review): _sanitize_filename is imported but never used below.
	        from picarones.core.runner import _load_partial, _partial_path, _sanitize_filename

	        corpus = load_corpus_from_directory(str(tmp_corpus))
	        corpus_name, engine_name = corpus.name, "mock_engine"
	        first_doc = corpus.documents[0]

	        # Write a partial file holding one already-processed document.
	        metrics_record = {
	            "cer": 0.1, "cer_nfc": 0.1, "cer_caseless": 0.1,
	            "wer": 0.1, "wer_normalized": 0.1, "mer": 0.1, "wil": 0.1,
	            "reference_length": 10, "hypothesis_length": 10,
	        }
	        record = {
	            "doc_id": first_doc.doc_id,
	            "image_path": str(first_doc.image_path),
	            "ground_truth": first_doc.ground_truth,
	            "hypothesis": "déjà traité",
	            "metrics": metrics_record,
	            "duration_seconds": 0.5,
	        }
	        partial_file = _partial_path(corpus_name, engine_name, tmp_path)
	        partial_file.write_text(json.dumps(record) + "\n", encoding="utf-8")

	        _, loaded = _load_partial(corpus_name, engine_name, tmp_path)
	        assert len(loaded) == 1
	        restored = loaded[0]
	        assert restored.doc_id == first_doc.doc_id
	        assert restored.hypothesis == "déjà traité"

	    def test_partial_load_returns_empty_for_missing_file(self, tmp_path):
	        """With no partial file on disk, the loaded list must be empty."""
	        from picarones.core.runner import _load_partial
	        _, loaded = _load_partial("corpus_inexistant", "moteur_inexistant", tmp_path)
	        assert loaded == []


	# ===========================================================================
	# Part 2 — Exceptions non silencieuses dans le runner
	# ===========================================================================

	class TestRunnerSilentExceptions:
	    """Failures inside the runner must not abort the benchmark silently."""

	    def test_confusion_failure_logs_warning(self, tmp_corpus, caplog):
	        """The benchmark must still complete when build_confusion_matrix raises.

	        NOTE(review): despite its name, this test only checks that the run
	        completes with one engine report; it never inspects ``caplog``.
	        """
	        import logging
	        import picarones.core.runner as runner_module
	        from picarones.core.corpus import load_corpus_from_directory
	        from picarones.core.runner import run_benchmark
	        from picarones.engines.base import BaseOCREngine

	        class MockEngine(BaseOCREngine):
	            @property
	            def name(self):
	                return "mock"

	            def version(self):
	                return "0.1"

	            def _run_ocr(self, image_path):
	                return "texte ocr"

	        corpus = load_corpus_from_directory(str(tmp_corpus))
	        # Pass-through wrapper around the real _compute_document_result.
	        passthrough_compute = patch(
	            "picarones.core.runner._compute_document_result",
	            wraps=runner_module._compute_document_result,
	        )
	        # Make every confusion-matrix build raise.
	        failing_confusion = patch(
	            "picarones.core.confusion.build_confusion_matrix",
	            side_effect=RuntimeError("crash test"),
	        )
	        with passthrough_compute, failing_confusion, caplog.at_level(logging.WARNING):
	            result = run_benchmark(corpus, [MockEngine()], show_progress=False)

	        assert result is not None, "Le benchmark ne doit pas planter si la confusion matrix échoue"
	        # The key point is that the benchmark finishes normally.
	        assert len(result.engine_reports) == 1

	    def test_progress_callback_failure_logs_warning(self, tmp_corpus, caplog):
	        """An exception raised by progress_callback must be logged, not propagated."""
	        import logging
	        from picarones.core.corpus import load_corpus_from_directory
	        from picarones.core.runner import run_benchmark
	        from picarones.engines.base import BaseOCREngine

	        class MockEngine(BaseOCREngine):
	            @property
	            def name(self):
	                return "mock"

	            def version(self):
	                return "0.1"

	            def _run_ocr(self, image_path):
	                return "texte"

	        def bad_callback(engine_name, doc_idx, doc_id):
	            raise ValueError("callback crash")

	        corpus = load_corpus_from_directory(str(tmp_corpus))
	        with caplog.at_level(logging.WARNING):
	            result = run_benchmark(
	                corpus,
	                [MockEngine()],
	                show_progress=False,
	                progress_callback=bad_callback,
	            )
	        assert result is not None
	        logged_messages = (record.message for record in caplog.records)
	        assert any("progress_callback" in message for message in logged_messages), (
	            "L'exception du callback doit être loguée en WARNING"
	        )

	    def test_aggregate_helpers_log_on_failure(self, caplog):
	        """``_aggregate_confusion`` must log and return None on corrupt input."""
	        import logging
	        from picarones.core.runner import _aggregate_confusion
	        from picarones.core.results import DocumentResult
	        from picarones.core.metrics import MetricsResult

	        metrics = MetricsResult(
	            cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
	            wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
	            reference_length=2, hypothesis_length=2,
	        )
	        # Document result carrying corrupt confusion data; per the original
	        # author's note this is expected to break ConfusionMatrix(**...).
	        corrupt_result = DocumentResult(
	            doc_id="x",
	            image_path="x.png",
	            ground_truth="gt",
	            hypothesis="hyp",
	            metrics=metrics,
	            duration_seconds=0.1,
	            confusion_matrix={"invalid_key": "corrupt_data"},
	        )
	        with caplog.at_level(logging.WARNING):
	            aggregated = _aggregate_confusion([corrupt_result])
	        assert aggregated is None
	        logged_messages = (record.message for record in caplog.records)
	        assert any("aggregate_confusion" in message for message in logged_messages)


	# ===========================================================================
	# Part 4 — Validation du test de Wilcoxon contre valeurs de référence
	# ===========================================================================

	class TestWilcoxonValidation:
	    """Checks of ``wilcoxon_test`` against known reference values."""

	    def test_identical_sequences_not_significant(self):
	        """Identical sequences: no non-zero pair, p = 1.0, not significant."""
	        from picarones.core.statistics import wilcoxon_test
	        values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
	        outcome = wilcoxon_test(values, values)
	        assert outcome["significant"] is False
	        assert outcome["p_value"] == 1.0
	        assert outcome["n_pairs"] == 0

	    def test_all_positive_diffs_w_minus_is_zero(self):
	        """When every difference a−b is positive: W⁻ = 0 and W⁺ = n(n+1)/2."""
	        from picarones.core.statistics import wilcoxon_test
	        n = 10
	        rank_total = n * (n + 1) / 2.0
	        increasing = list(map(float, range(1, n + 1)))
	        zeros = [0.0] * n
	        outcome = wilcoxon_test(increasing, zeros)
	        assert math.isclose(outcome["W_minus"], 0.0, abs_tol=1e-9)
	        assert math.isclose(outcome["W_plus"], rank_total, abs_tol=1e-9)

	    def test_w_plus_w_minus_sum_invariant(self):
	        """W⁺ + W⁻ must equal n(n+1)/2, n being the reported ``n_pairs``."""
	        from picarones.core.statistics import wilcoxon_test
	        r = wilcoxon_test(
	            [0.10, 0.25, 0.05, 0.40, 0.30, 0.15, 0.20, 0.35, 0.08, 0.18],
	            [0.12, 0.20, 0.08, 0.35, 0.28, 0.18, 0.15, 0.40, 0.10, 0.20],
	        )
	        n = r["n_pairs"]
	        expected = n * (n + 1) / 2.0
	        actual = r["W_plus"] + r["W_minus"]
	        assert math.isclose(actual, expected, abs_tol=1e-6), (
	            f"W⁺+W⁻ = {actual} ≠ n(n+1)/2 = {expected}"
	        )

	    def test_clearly_different_sequences_significant(self):
	        """Two very different sequences (n=15) must yield p < 0.05."""
	        from picarones.core.statistics import wilcoxon_test
	        size = 15
	        # Every difference a−b equals −0.55, so W⁺ = 0.
	        low_errors = [0.05] * size
	        high_errors = [0.60] * size
	        r = wilcoxon_test(low_errors, high_errors)
	        assert r["significant"] is True, f"p = {r['p_value']} — devrait être significatif"
	        assert r["p_value"] < 0.05

	    def test_large_n_normal_approximation_reasonable(self):
	        """For 20 pairs the p-value must lie within [0, 1]."""
	        from picarones.core.statistics import wilcoxon_test
	        import random
	        generator = random.Random(42)
	        # Draw all of `baseline` first, then the offsets, to keep the
	        # pseudo-random sequence identical from run to run.
	        baseline = [generator.uniform(0.1, 0.5) for _ in range(20)]
	        shifted = [value + generator.uniform(0.0, 0.1) for value in baseline]
	        outcome = wilcoxon_test(baseline, shifted)
	        assert 0.0 <= outcome["p_value"] <= 1.0
	        assert outcome["n_pairs"] <= 20

	    def test_small_n_returns_conservative_p(self):
	        """Without scipy, a 3-pair sample must give p = 0.04 or p = 0.20."""
	        from picarones.core.statistics import wilcoxon_test, _SCIPY_AVAILABLE
	        if _SCIPY_AVAILABLE:
	            pytest.skip("scipy disponible — la table exacte n'est pas utilisée")
	        # Every difference is −0.4, so W = 0; the original author's note
	        # expects p = 0.04 here (W ≤ _W_CRITICAL[3] = 0).
	        lower = [0.1, 0.2, 0.3]
	        upper = [0.5, 0.6, 0.7]
	        outcome = wilcoxon_test(lower, upper)
	        assert outcome["p_value"] in (0.04, 0.20)

	    def test_result_keys_complete(self):
	        """The returned dict must contain every documented key."""
	        from picarones.core.statistics import wilcoxon_test
	        required_keys = (
	            "statistic", "p_value", "significant", "interpretation",
	            "n_pairs", "W_plus", "W_minus",
	        )
	        first = [0.1, 0.3, 0.2, 0.4, 0.15, 0.35, 0.25, 0.5, 0.45, 0.05]
	        second = [0.2, 0.2, 0.3, 0.3, 0.25, 0.25, 0.35, 0.35, 0.40, 0.15]
	        r = wilcoxon_test(first, second)
	        for key in required_keys:
	            assert key in r, f"Clé manquante dans le résultat Wilcoxon : {key}"


	# ===========================================================================
	# Part 4 — Cohérence scipy / implémentation native
	# ===========================================================================

	class TestWilcoxonScipyIntegration:
	    """Behaviour of ``wilcoxon_test`` when scipy is available."""

	    def test_scipy_available_flag_is_bool(self):
	        """``_SCIPY_AVAILABLE`` must be a bool."""
	        from picarones.core.statistics import _SCIPY_AVAILABLE
	        flag = _SCIPY_AVAILABLE
	        assert isinstance(flag, bool)

	    def test_scipy_and_native_agree_on_significance(self):
	        """With scipy available, clearly separated samples must be significant.

	        NOTE(review): ``_native_p_value`` is imported but never called, so
	        no scipy-versus-native comparison actually takes place here.
	        """
	        from picarones.core.statistics import wilcoxon_test, _SCIPY_AVAILABLE, _native_p_value
	        if not _SCIPY_AVAILABLE:
	            pytest.skip("scipy non disponible")

	        # 15 pairs with a wide gap between the two samples.
	        low_values = [
	            0.05, 0.08, 0.06, 0.07, 0.04, 0.09, 0.05, 0.07, 0.06, 0.08,
	            0.05, 0.07, 0.06, 0.08, 0.04,
	        ]
	        high_values = [
	            0.30, 0.35, 0.28, 0.32, 0.31, 0.29, 0.34, 0.33, 0.30, 0.31,
	            0.29, 0.32, 0.33, 0.30, 0.31,
	        ]
	        outcome = wilcoxon_test(low_values, high_values)
	        # The result must be flagged as significant.
	        assert outcome["significant"] is True

	    def test_scipy_p_value_in_valid_range(self):
	        """With scipy available, the p-value must lie within [0, 1]."""
	        from picarones.core.statistics import wilcoxon_test, _SCIPY_AVAILABLE
	        if not _SCIPY_AVAILABLE:
	            pytest.skip("scipy non disponible")

	        indices = range(12)
	        steeper = [0.1 + i * 0.02 for i in indices]
	        flatter = [0.1 + i * 0.01 for i in indices]
	        outcome = wilcoxon_test(steeper, flatter)
	        assert 0.0 <= outcome["p_value"] <= 1.0