Spaces:
Sleeping
Sleeping
Claude
fix: résoudre les 64 erreurs ruff pré-existantes révélées par le lint actif
6362212 unverified | """Tests Sprint 4 : normalisation diplomatique, import IIIF, adaptateurs API OCR.""" | |
| from __future__ import annotations | |
| import pytest | |
| from picarones.core.normalization import ( | |
| NormalizationProfile, | |
| DEFAULT_DIPLOMATIC_PROFILE, | |
| _apply_diplomatic_table, | |
| get_builtin_profile, | |
| ) | |
| from picarones.core.metrics import compute_metrics, aggregate_metrics, MetricsResult | |
| from picarones.importers.iiif import ( | |
| IIIFManifestParser, | |
| parse_page_selector, | |
| _extract_label, | |
| _best_image_url_v2, | |
| _best_image_url_v3, | |
| _guess_extension, | |
| _slugify, | |
| ) | |
# ===========================================================================
# NormalizationProfile tests
# ===========================================================================
class TestNormalizationProfile:
    """Checks on ``NormalizationProfile``: defaults, ``normalize`` and (de)serialisation."""

    def test_default_nfc_only(self):
        """A profile created with only a name has NFC on, caseless off, empty table."""
        bare = NormalizationProfile(name="test")
        assert bare.nfc is True
        assert bare.caseless is False
        assert bare.diplomatic_table == {}

    def test_normalize_nfc(self):
        """A decomposed 'e' + combining acute must come back as the single code point."""
        nfc_profile = NormalizationProfile(name="nfc_only")
        decomposed = "e\u0301"  # base letter followed by the combining accent
        precomposed = "\u00e9"  # composed form expected after normalisation
        assert nfc_profile.normalize(decomposed) == precomposed

    def test_normalize_caseless(self):
        """With ``caseless=True`` the output is expected in lower case."""
        folded = NormalizationProfile(name="caseless", caseless=True).normalize(
            "Bonjour MONDE"
        )
        assert folded == "bonjour monde"

    def test_normalize_diplomatic_table(self):
        """Each key of the diplomatic table is replaced by its value."""
        table = {"ſ": "s", "u": "v"}
        mapper = NormalizationProfile(name="test", diplomatic_table=table)
        # Long s becomes s; the word contains no "u", so nothing else changes.
        assert mapper.normalize("maiſon") == "maison"
        # Old spelling of "vers": the initial u becomes v.
        assert mapper.normalize("uers") == "vers"

    def test_normalize_order_nfc_then_caseless_then_diplomatic(self):
        """Caseless folding and the diplomatic table combine on the same input."""
        combined = NormalizationProfile(
            name="combined",
            caseless=True,
            diplomatic_table={"ſ": "s"},
        )
        assert combined.normalize("Maiſon") == "maison"

    def test_as_dict(self):
        """``as_dict`` exposes the name, the table and the caseless flag."""
        exported = NormalizationProfile(
            name="medieval_french",
            nfc=True,
            caseless=False,
            diplomatic_table={"ſ": "s"},
            description="Test",
        ).as_dict()
        assert exported["name"] == "medieval_french"
        assert exported["diplomatic_table"] == {"ſ": "s"}
        assert exported["caseless"] is False

    def test_from_dict(self):
        """``from_dict`` reads the table from the ``diplomatic`` key."""
        payload = {
            "name": "custom",
            "caseless": True,
            "diplomatic": {"ſ": "s", "u": "v"},
            "description": "Custom profile",
        }
        loaded = NormalizationProfile.from_dict(payload)
        assert loaded.name == "custom"
        assert loaded.caseless is True
        assert loaded.diplomatic_table == {"ſ": "s", "u": "v"}

    def test_from_dict_defaults(self):
        """An empty mapping yields the name "custom" with NFC on and caseless off."""
        fallback = NormalizationProfile.from_dict({})
        assert fallback.name == "custom"
        assert fallback.nfc is True
        assert fallback.caseless is False

    def test_from_yaml(self, tmp_path):
        """Load a profile from a YAML file; skip when the loader reports missing pyyaml."""
        source = tmp_path / "profile.yaml"
        source.write_text(
            "name: my_profile\ncaseless: false\ndiplomatic:\n \u017f: s\n u: v\n",
            encoding="utf-8",
        )
        try:
            loaded = NormalizationProfile.from_yaml(source)
            assert loaded.name == "my_profile"
            assert loaded.diplomatic_table == {"\u017f": "s", "u": "v"}
        except RuntimeError as exc:
            # Only a RuntimeError mentioning pyyaml is treated as "dependency absent".
            if "pyyaml" in str(exc):
                pytest.skip("pyyaml non installé")
            raise
class TestApplyDiplomaticTable:
    """Checks on the ``_apply_diplomatic_table`` substitution helper."""

    def test_simple_substitutions(self):
        """Single-character keys are replaced wherever they occur."""
        mapping = {"ſ": "s", "u": "v"}
        expectations = {
            "maiſon": "maison",  # long s replaced, no "u" in the word
            "uers": "vers",  # initial u replaced by v
        }
        for raw, expected in expectations.items():
            assert _apply_diplomatic_table(raw, mapping) == expected

    def test_multi_char_key_priority(self):
        """Multi-character keys win over single-character ones."""
        mapping = {"ae": "X", "a": "Y"}
        # "ae" must turn into "X", not into "Ye".
        assert _apply_diplomatic_table("aeb", mapping) == "Xb"

    def test_ampersand_to_et(self):
        """An ampersand can be expanded to a multi-character replacement."""
        rewritten = _apply_diplomatic_table("noir & blanc", {"&": "et"})
        assert rewritten == "noir et blanc"

    def test_empty_table(self):
        """An empty table leaves the text untouched."""
        untouched = _apply_diplomatic_table("hello", {})
        assert untouched == "hello"

    def test_empty_text(self):
        """Empty input stays empty whatever the table contains."""
        assert _apply_diplomatic_table("", {"a": "b"}) == ""
class TestGetBuiltinProfile:
    """Checks on the built-in profiles returned by ``get_builtin_profile``."""

    def test_medieval_french(self):
        """The medieval French profile carries its own name and maps long s to s."""
        medieval = get_builtin_profile("medieval_french")
        assert medieval.name == "medieval_french"
        table = medieval.diplomatic_table
        assert "ſ" in table
        assert table["ſ"] == "s"

    def test_early_modern_french(self):
        """The early modern French profile has an entry for long s."""
        table = get_builtin_profile("early_modern_french").diplomatic_table
        assert "ſ" in table

    def test_medieval_latin(self):
        """The medieval Latin profile has an entry for the ꝑ abbreviation sign."""
        table = get_builtin_profile("medieval_latin").diplomatic_table
        assert "ꝑ" in table

    def test_minimal(self):
        """The minimal profile has an entry for long s but none for u."""
        table = get_builtin_profile("minimal").diplomatic_table
        assert "ſ" in table
        assert "u" not in table

    def test_nfc(self):
        """The "nfc" profile has NFC enabled and an empty table."""
        nfc_only = get_builtin_profile("nfc")
        assert nfc_only.nfc is True
        assert nfc_only.diplomatic_table == {}

    def test_caseless(self):
        """The "caseless" profile has the caseless flag set."""
        assert get_builtin_profile("caseless").caseless is True

    def test_unknown_raises_key_error(self):
        """An unknown name raises ``KeyError`` whose message contains that name."""
        unknown = "inexistant"
        with pytest.raises(KeyError, match="inexistant"):
            get_builtin_profile(unknown)

    def test_default_profile_is_medieval_french(self):
        """The module-level default profile is the medieval French one."""
        default_name = DEFAULT_DIPLOMATIC_PROFILE.name
        assert default_name == "medieval_french"
# ===========================================================================
# Diplomatic CER tests (compute_metrics)
# ===========================================================================
class TestDiplomaticCER:
    """Checks on the diplomatic CER exposed by ``compute_metrics`` / ``aggregate_metrics``."""

    def test_cer_diplomatic_computed_by_default(self):
        """Without an explicit profile, a diplomatic CER is computed and the
        reported profile name is "medieval_french"."""
        result = compute_metrics("maiſon", "maison")
        assert result.cer_diplomatic is not None
        assert result.diplomatic_profile_name == "medieval_french"

    def test_cer_diplomatic_lower_than_exact_for_long_s(self):
        """With ſ→s the diplomatic CER must be 0.0 for "maiſon" vs "maison",
        because both sides normalise to "maison"."""
        # The two strings differ only by long s vs s.
        result = compute_metrics("maiſon", "maison")
        # Raw CER is non-zero: ſ and s are different characters.
        assert result.cer > 0.0
        # Diplomatic CER is zero: the default profile treats ſ and s as equivalent.
        assert result.cer_diplomatic == pytest.approx(0.0)

    def test_cer_diplomatic_in_as_dict(self):
        """``as_dict`` exposes both the diplomatic CER and the profile name."""
        exported = compute_metrics("maiſon", "maison").as_dict()
        assert "cer_diplomatic" in exported
        assert "diplomatic_profile_name" in exported

    def test_cer_diplomatic_with_custom_profile(self):
        """A caller-supplied profile is used and its name is reported back."""
        # NormalizationProfile is already imported at module level; no local
        # re-import is needed here.
        profile = NormalizationProfile(
            name="test_profile",
            diplomatic_table={"ſ": "s"},
        )
        result = compute_metrics("maiſon", "maison", normalization_profile=profile)
        assert result.cer_diplomatic == pytest.approx(0.0)
        assert result.diplomatic_profile_name == "test_profile"

    def test_cer_diplomatic_not_in_as_dict_when_none(self):
        """When the diplomatic CER is ``None`` it is left out of ``as_dict``."""
        result = MetricsResult(
            cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
            wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
            reference_length=10, hypothesis_length=10,
            cer_diplomatic=None, diplomatic_profile_name=None,
        )
        assert "cer_diplomatic" not in result.as_dict()

    def test_aggregate_metrics_includes_diplomatic_cer(self):
        """``aggregate_metrics`` aggregates ``cer_diplomatic`` when it is available."""
        results = [
            MetricsResult(
                cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
                wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
                reference_length=10, hypothesis_length=10,
                cer_diplomatic=0.05, diplomatic_profile_name="medieval_french",
            ),
            MetricsResult(
                cer=0.2, cer_nfc=0.2, cer_caseless=0.2,
                wer=0.2, wer_normalized=0.2, mer=0.2, wil=0.2,
                reference_length=10, hypothesis_length=10,
                cer_diplomatic=0.10, diplomatic_profile_name="medieval_french",
            ),
        ]
        agg = aggregate_metrics(results)
        assert "cer_diplomatic" in agg
        # Mean of 0.05 and 0.10.
        assert agg["cer_diplomatic"]["mean"] == pytest.approx(0.075)
        assert agg["cer_diplomatic"].get("profile") == "medieval_french"
# ===========================================================================
# parse_page_selector tests
# ===========================================================================
class TestParsePageSelector:
    """Checks on ``parse_page_selector``: 1-based selectors in, 0-based indices out."""

    def test_all(self):
        """The keyword "all" selects every page."""
        every_page = list(range(10))
        assert parse_page_selector("all", 10) == every_page

    def test_empty_string(self):
        """An empty selector also selects every page."""
        every_page = list(range(5))
        assert parse_page_selector("", 5) == every_page

    def test_single_page(self):
        """A single page number maps to its 0-based index."""
        assert parse_page_selector("3", 10) == [2]

    def test_range(self):
        """An inclusive range expands to consecutive indices."""
        picked = parse_page_selector("1-5", 10)
        assert picked == [0, 1, 2, 3, 4]

    def test_comma_list(self):
        """Comma-separated pages are each converted."""
        picked = parse_page_selector("1,3,5", 10)
        assert picked == [0, 2, 4]

    def test_combined(self):
        """Ranges and single pages can be mixed in one selector."""
        picked = parse_page_selector("1-3,5,8-9", 10)
        assert picked == [0, 1, 2, 4, 7, 8]

    def test_deduplication(self):
        """A page listed twice appears once in the result."""
        picked = parse_page_selector("1,1,2", 5)
        assert picked == [0, 1]

    def test_sorted_output(self):
        """The result is sorted regardless of the selector order."""
        picked = parse_page_selector("5,1,3", 10)
        assert picked == [0, 2, 4]

    def test_page_out_of_range_raises(self):
        """A page beyond the page count raises ``ValueError``."""
        with pytest.raises(ValueError):
            parse_page_selector("15", 10)

    def test_range_out_of_bounds_raises(self):
        """A range whose end exceeds the page count raises ``ValueError``."""
        with pytest.raises(ValueError):
            parse_page_selector("1-15", 10)

    def test_invalid_syntax_raises(self):
        """A non-numeric selector raises."""
        # NOTE(review): the tuple below is equivalent to catching Exception;
        # narrow it once the exception type raised by the parser is confirmed.
        with pytest.raises((ValueError, Exception)):
            parse_page_selector("abc", 10)

    def test_last_page(self):
        """The last page number is accepted."""
        assert parse_page_selector("10", 10) == [9]

    def test_first_page(self):
        """Page 1 maps to index 0."""
        assert parse_page_selector("1", 10) == [0]
# ===========================================================================
# IIIFManifestParser tests: manifest builders, IIIF v2 and v3
# ===========================================================================
def _make_v2_manifest(num_canvases: int = 3, with_service: bool = False) -> dict:
    """Build a minimal IIIF Presentation v2 manifest for tests.

    Args:
        num_canvases: Number of canvases in the single sequence. Canvases are
            numbered from 1 in their URIs and labels ("f. 1r", "f. 2r", ...).
        with_service: When true, each image resource carries a ``service``
            entry with an ``@id`` instead of a direct ``@id`` image URL.

    Returns:
        A manifest dict with one ``sc:Sequence`` holding ``num_canvases``
        canvases of 2000x3000, each with one painting annotation.
    """

    def _image_resource(page: int) -> dict:
        # Either a service-backed resource or a plain image URL, never both.
        if with_service:
            return {
                "@type": "dctypes:Image",
                "service": {"@id": f"https://example.com/iiif/img{page}"},
            }
        return {
            "@type": "dctypes:Image",
            "@id": f"https://example.com/images/img{page}.jpg",
        }

    def _canvas(page: int) -> dict:
        # The canvas URI is used both as the canvas id and as the annotation target.
        canvas_uri = f"https://example.com/canvas/{page}"
        return {
            "@id": canvas_uri,
            "@type": "sc:Canvas",
            "label": f"f. {page}r",
            "width": 2000,
            "height": 3000,
            "images": [
                {
                    "@type": "oa:Annotation",
                    "motivation": "sc:painting",
                    "resource": _image_resource(page),
                    "on": canvas_uri,
                }
            ],
        }

    return {
        "@context": "http://iiif.io/api/presentation/2/context.json",
        "@type": "sc:Manifest",
        "@id": "https://example.com/manifest.json",
        "label": "Manuscript de test",
        "sequences": [
            {
                "@type": "sc:Sequence",
                "canvases": [_canvas(page) for page in range(1, num_canvases + 1)],
            }
        ],
    }
def _make_v3_manifest(num_canvases: int = 3) -> dict:
    """Build a minimal IIIF Presentation v3 manifest for tests.

    Args:
        num_canvases: Number of canvases under ``items``. Canvases are
            numbered from 1 in their URIs and labels ("Page 1", "Page 2", ...).

    Returns:
        A manifest dict whose ``items`` holds ``num_canvases`` canvases of
        1500x2200, each with one annotation page containing one painting
        annotation whose body is a JPEG image.
    """

    def _canvas(page: int) -> dict:
        # The canvas URI is the prefix of the nested ids and the annotation target.
        canvas_uri = f"https://example.com/canvas/{page}"
        return {
            "id": canvas_uri,
            "type": "Canvas",
            "label": {"fr": [f"Page {page}"]},
            "width": 1500,
            "height": 2200,
            "items": [
                {
                    "id": f"{canvas_uri}/ap",
                    "type": "AnnotationPage",
                    "items": [
                        {
                            "id": f"{canvas_uri}/ap/a",
                            "type": "Annotation",
                            "motivation": "painting",
                            "body": {
                                "id": f"https://example.com/images/{page}/full/max/0/default.jpg",
                                "type": "Image",
                                "format": "image/jpeg",
                            },
                            "target": canvas_uri,
                        }
                    ],
                }
            ],
        }

    return {
        "@context": "http://iiif.io/api/presentation/3/context.json",
        "id": "https://example.com/manifest.json",
        "type": "Manifest",
        "label": {"fr": ["Manuscrit v3 de test"]},
        "items": [_canvas(page) for page in range(1, num_canvases + 1)],
    }
class TestIIIFManifestParserV2:
    """Checks on ``IIIFManifestParser`` fed with IIIF v2 manifests."""

    def test_version_detection(self):
        """A v2 manifest is reported as version 2."""
        assert IIIFManifestParser(_make_v2_manifest()).version == 2

    def test_canvases_count(self):
        """One canvas object is returned per canvas in the manifest."""
        found = IIIFManifestParser(_make_v2_manifest(5)).canvases()
        assert len(found) == 5

    def test_canvas_label(self):
        """Canvas labels are taken from the manifest as plain strings."""
        found = IIIFManifestParser(_make_v2_manifest()).canvases()
        first, second = found[0], found[1]
        assert first.label == "f. 1r"
        assert second.label == "f. 2r"

    def test_canvas_image_url_direct(self):
        """Without a service, the resource ``@id`` is the image URL."""
        first = IIIFManifestParser(_make_v2_manifest()).canvases()[0]
        assert first.image_url == "https://example.com/images/img1.jpg"

    def test_canvas_image_url_via_service(self):
        """With a service, the image URL contains the full/max IIIF request path."""
        service_manifest = _make_v2_manifest(with_service=True)
        first = IIIFManifestParser(service_manifest).canvases()[0]
        assert "/full/max/0/default.jpg" in first.image_url

    def test_canvas_dimensions(self):
        """Width and height are carried over from the canvas."""
        first = IIIFManifestParser(_make_v2_manifest()).canvases()[0]
        assert first.width == 2000
        assert first.height == 3000

    def test_canvas_index(self):
        """Each canvas knows its 0-based position in the sequence."""
        found = IIIFManifestParser(_make_v2_manifest(3)).canvases()
        assert [canvas.index for canvas in found] == list(range(len(found)))

    def test_label(self):
        """The manifest label is exposed on the parser."""
        manifest_label = IIIFManifestParser(_make_v2_manifest()).label
        assert manifest_label == "Manuscript de test"

    def test_empty_sequences(self):
        """A manifest with no sequence yields no canvases."""
        empty_manifest = {
            "@context": "http://iiif.io/api/presentation/2/context.json",
            "@type": "sc:Manifest",
            "label": "Empty",
            "sequences": [],
        }
        assert IIIFManifestParser(empty_manifest).canvases() == []
class TestIIIFManifestParserV3:
    """Checks on ``IIIFManifestParser`` fed with IIIF v3 manifests."""

    def test_version_detection(self):
        """A v3 manifest is reported as version 3."""
        assert IIIFManifestParser(_make_v3_manifest()).version == 3

    def test_canvases_count(self):
        """One canvas object is returned per item in the manifest."""
        found = IIIFManifestParser(_make_v3_manifest(4)).canvases()
        assert len(found) == 4

    def test_canvas_label_from_language_map(self):
        """The canvas label is extracted from the v3 language map."""
        first = IIIFManifestParser(_make_v3_manifest()).canvases()[0]
        assert "Page 1" in first.label

    def test_canvas_image_url(self):
        """The image URL comes from the painting annotation body."""
        first = IIIFManifestParser(_make_v3_manifest()).canvases()[0]
        assert "default.jpg" in first.image_url

    def test_manifest_label_language_map(self):
        """The manifest label is extracted from the v3 language map."""
        lowered = IIIFManifestParser(_make_v3_manifest()).label.lower()
        assert "v3" in lowered or "test" in lowered

    def test_type_manifest_triggers_v3(self):
        """A manifest whose ``type`` is "Manifest" is detected as v3."""
        bare_manifest = {"type": "Manifest", "items": []}
        assert IIIFManifestParser(bare_manifest).version == 3
class TestExtractLabel:
    """Checks on ``_extract_label`` for the label shapes found in IIIF manifests."""

    def test_string(self):
        """A plain string is returned unchanged."""
        label = "Page 1"
        assert _extract_label(label) == "Page 1"

    def test_list(self):
        """For a list, the first element is returned."""
        labels = ["Page 1", "Page 2"]
        assert _extract_label(labels) == "Page 1"

    def test_dict_fr(self):
        """A language map with a "fr" entry yields its first value."""
        language_map = {"fr": ["Folio 1r"]}
        assert _extract_label(language_map) == "Folio 1r"

    def test_dict_en(self):
        """A language map with an "en" entry yields its first value."""
        language_map = {"en": ["Folio 1r"]}
        assert _extract_label(language_map) == "Folio 1r"

    def test_dict_none_key(self):
        """A language map keyed by "none" yields its first value."""
        language_map = {"none": ["Label"]}
        assert _extract_label(language_map) == "Label"

    def test_empty_string(self):
        """An empty string stays empty."""
        assert _extract_label("") == ""

    def test_none_value(self):
        """``None`` still produces a string."""
        assert isinstance(_extract_label(None), str)
class TestBestImageUrlV2:
    """Checks on ``_best_image_url_v2`` for direct and service-backed resources."""

    def test_direct_id(self):
        """Without a service, the resource ``@id`` is returned as is."""
        direct = {"@id": "https://example.com/img.jpg"}
        assert _best_image_url_v2(direct, {}) == "https://example.com/img.jpg"

    def test_service_id(self):
        """A service ``@id`` takes precedence and gets the full/max request path."""
        with_service = {
            "@id": "https://example.com/info.json",
            "service": {"@id": "https://example.com/iiif/img1"},
        }
        expected = "https://example.com/iiif/img1/full/max/0/default.jpg"
        assert _best_image_url_v2(with_service, {}) == expected

    def test_service_list(self):
        """A service given as a list is handled like a single service."""
        with_service_list = {
            "service": [
                {"@id": "https://example.com/iiif/img2"},
            ]
        }
        expected = "https://example.com/iiif/img2/full/max/0/default.jpg"
        assert _best_image_url_v2(with_service_list, {}) == expected
class TestBestImageUrlV3:
    """Checks on ``_best_image_url_v3`` for v3 canvases."""

    def test_direct_body_image(self):
        """The ``id`` of an image body is returned as is."""
        painting = {
            "type": "Annotation",
            "motivation": "painting",
            "body": {
                "id": "https://example.com/img.jpg",
                "type": "Image",
            },
        }
        canvas = {"items": [{"type": "AnnotationPage", "items": [painting]}]}
        assert _best_image_url_v3(canvas) == "https://example.com/img.jpg"

    def test_body_via_service(self):
        """With an empty body ``id``, the URL contains the full/max request path."""
        service_body = {
            "type": "Image",
            "id": "",
            "service": [{"id": "https://example.com/iiif/3/img1"}],
        }
        canvas = {"items": [{"items": [{"body": service_body}]}]}
        resolved = _best_image_url_v3(canvas)
        assert "/full/max/0/default.jpg" in resolved

    def test_empty_canvas(self):
        """An empty canvas yields an empty URL."""
        assert _best_image_url_v3({}) == ""
class TestGuessExtension:
    """Checks on ``_guess_extension`` for image URLs."""

    def test_jpg(self):
        """A ``.jpg`` URL gives ``.jpg``."""
        guessed = _guess_extension("https://example.com/img.jpg")
        assert guessed == ".jpg"

    def test_png(self):
        """A ``.png`` URL gives ``.png``."""
        guessed = _guess_extension("https://example.com/img.png")
        assert guessed == ".png"

    def test_tiff(self):
        """A ``.tiff`` URL gives ``.tiff``."""
        guessed = _guess_extension("https://example.com/img.tiff")
        assert guessed == ".tiff"

    def test_iiif_default(self):
        """A standard IIIF image request ending in ``/default.jpg`` gives ``.jpg``."""
        iiif_request = "https://example.com/iiif/img/full/max/0/default.jpg"
        assert _guess_extension(iiif_request) == ".jpg"

    def test_unknown_defaults_to_jpg(self):
        """A URL with no recognisable extension gives ``.jpg``."""
        guessed = _guess_extension("https://example.com/resource/123")
        assert guessed == ".jpg"
class TestSlugify:
    """Checks on ``_slugify`` for canvas labels."""

    def test_simple(self):
        """A space becomes an underscore."""
        slug = _slugify("Page 1")
        assert slug == "Page_1"

    def test_special_chars_removed(self):
        """Neither slashes nor dots appear in the slug."""
        slug = _slugify("f. 1r (recto)")
        for forbidden in ("/", "."):
            assert forbidden not in slug

    def test_max_length(self):
        """The slug of a 100-character label is at most 60 characters long."""
        slug = _slugify("x" * 100)
        assert len(slug) <= 60

    def test_empty(self):
        """An empty label gives an empty slug."""
        assert _slugify("") == ""
# ===========================================================================
# Structural tests for the new OCR engines (no network calls)
# ===========================================================================
class TestMistralOCREngine:
    """Structural checks on ``MistralOCREngine``."""

    def test_import(self):
        """The engine class can be imported from its module."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        assert MistralOCREngine is not None

    def test_name(self):
        """The engine reports the name "mistral_ocr"."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        assert MistralOCREngine().name == "mistral_ocr"

    def test_version_default_model(self):
        """With no configuration, the version string mentions "mistral-ocr"."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        reported = MistralOCREngine().version()
        assert "mistral-ocr" in reported

    def test_version_custom_model(self):
        """A configured model name is reported verbatim as the version."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        config = {"model": "pixtral-large-latest"}
        assert MistralOCREngine(config).version() == "pixtral-large-latest"

    def test_missing_api_key_raises(self, monkeypatch, tmp_path):
        """Running OCR without MISTRAL_API_KEY raises a RuntimeError naming the variable."""
        from picarones.engines.mistral_ocr import MistralOCREngine

        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()
        # Dummy image file holding only the three leading JPEG bytes.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine class is re-exported by the ``picarones.engines`` package."""
        from picarones.engines import MistralOCREngine

        assert MistralOCREngine is not None
class TestGoogleVisionEngine:
    """Structural checks on ``GoogleVisionEngine``."""

    def test_import(self):
        """The engine class can be imported from its module."""
        from picarones.engines.google_vision import GoogleVisionEngine

        assert GoogleVisionEngine is not None

    def test_name(self):
        """The engine reports the name "google_vision"."""
        from picarones.engines.google_vision import GoogleVisionEngine

        assert GoogleVisionEngine().name == "google_vision"

    def test_version(self):
        """The engine reports version "v1"."""
        from picarones.engines.google_vision import GoogleVisionEngine

        assert GoogleVisionEngine().version() == "v1"

    def test_missing_credentials_raises(self, monkeypatch, tmp_path):
        """Running OCR with both Google credential variables unset raises RuntimeError."""
        from picarones.engines.google_vision import GoogleVisionEngine

        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
        engine = GoogleVisionEngine()
        # Dummy image file holding only the three leading JPEG bytes.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine class is re-exported by the ``picarones.engines`` package."""
        from picarones.engines import GoogleVisionEngine

        assert GoogleVisionEngine is not None
class TestAzureDocIntelEngine:
    """Structural checks on ``AzureDocIntelEngine``."""

    def test_import(self):
        """The engine class can be imported from its module."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        assert AzureDocIntelEngine is not None

    def test_name(self):
        """The engine reports the name "azure_doc_intel"."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        assert AzureDocIntelEngine().name == "azure_doc_intel"

    def test_missing_key_raises(self, monkeypatch, tmp_path):
        """Running OCR with the Azure key and endpoint variables unset raises RuntimeError."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine

        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
        engine = AzureDocIntelEngine()
        # Dummy image file holding only the three leading JPEG bytes.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine class is re-exported by the ``picarones.engines`` package."""
        from picarones.engines import AzureDocIntelEngine

        assert AzureDocIntelEngine is not None
# ===========================================================================
# CLI tests: "import iiif" command
# ===========================================================================
class TestCLIImportIIIF:
    """Checks that the ``import iiif`` CLI command is wired up."""

    @staticmethod
    def _invoke(*argv):
        """Run the picarones CLI with ``argv`` through click's test runner."""
        from picarones.cli import cli
        from click.testing import CliRunner

        return CliRunner().invoke(cli, list(argv))

    def test_import_group_exists(self):
        """``import --help`` exits successfully."""
        outcome = self._invoke("import", "--help")
        assert outcome.exit_code == 0

    def test_import_iiif_command_exists(self):
        """``import iiif --help`` exits successfully and mentions the manifest URL argument."""
        outcome = self._invoke("import", "iiif", "--help")
        assert outcome.exit_code == 0
        help_text = outcome.output
        assert "manifest_url" in help_text.lower() or "MANIFEST_URL" in help_text

    def test_import_iiif_options(self):
        """The help text lists the ``--pages`` and ``--output`` options."""
        help_text = self._invoke("import", "iiif", "--help").output
        assert "--pages" in help_text
        assert "--output" in help_text

    def test_import_iiif_requires_url(self):
        """Invoking the command without a URL ends with a non-zero exit code."""
        outcome = self._invoke("import", "iiif")
        assert outcome.exit_code != 0
# ===========================================================================
# Sprint 4 fixture tests (diplomatic CER in the demo data)
# ===========================================================================
class TestFixturesDiplomaticCER:
    """Checks that the demo fixtures exercise the diplomatic CER."""

    def test_gt_texts_contain_medieval_graphies(self):
        """The demo ground-truth texts must contain medieval spellings."""
        from picarones.fixtures import _GT_TEXTS

        all_gt = " ".join(_GT_TEXTS)
        # At least one of long s, ampersand or the ae/oe ligatures must be present.
        has_medieval_chars = any(c in all_gt for c in ("ſ", "&", "æ", "œ"))
        assert has_medieval_chars, "Les GT de démo doivent inclure des graphies médiévales pour illustrer le CER diplomatique"

    def test_benchmark_results_have_diplomatic_cer(self):
        """The sample benchmark results must include the diplomatic CER.

        For every engine report, the first document whose metrics carry no
        error is checked; one document per engine is enough.
        """
        from picarones.fixtures import generate_sample_benchmark

        bm = generate_sample_benchmark()
        for engine_report in bm.engine_reports:
            # Explicit "first scorable document" lookup: the intent does not
            # depend on where a ``break`` sits inside a nested loop.
            first_scored = next(
                (
                    doc_result
                    for doc_result in engine_report.document_results
                    if doc_result.metrics.error is None
                ),
                None,
            )
            if first_scored is None:
                # Every document of this engine failed; nothing to verify.
                continue
            assert first_scored.metrics.cer_diplomatic is not None, (
                f"CER diplomatique manquant pour {engine_report.engine_name}"
            )

    def test_diplomatic_cer_lower_for_medieval_graphies(self):
        """For a text containing long s, diplomatic CER must be <= exact CER."""
        result = compute_metrics(
            "maiſon & jardin",  # ground truth with medieval spellings
            "maison et jardin",  # OCR output with modernised spellings
        )
        assert result.cer_diplomatic is not None
        assert result.cer_diplomatic <= result.cer
# ===========================================================================
# Sprint 4 HTML report tests (diplomatic CER displayed)
# ===========================================================================
class TestReportDiplomaticCER:
    """Checks that the HTML report surfaces the diplomatic CER."""

    @staticmethod
    def _render_sample_report(tmp_path):
        """Generate the sample benchmark report under ``tmp_path`` and return its HTML."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        benchmark = generate_sample_benchmark()
        target = tmp_path / "report_test.html"
        ReportGenerator(benchmark).generate(target)
        return target.read_text(encoding="utf-8")

    def test_report_data_has_cer_diplomatic(self):
        """Every entry under ``engines`` in the report data has a ``cer_diplomatic`` key."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data

        report_data = _build_report_data(generate_sample_benchmark(), images_b64={})
        assert "engines" in report_data
        # The key must exist for each engine entry.
        for entry in report_data["engines"]:
            assert "cer_diplomatic" in entry, (
                f"cer_diplomatic manquant dans {entry.get('name', '?')}"
            )

    def test_html_contains_cer_diplo_column(self, tmp_path):
        """The generated HTML mentions the diplomatic CER."""
        lowered = self._render_sample_report(tmp_path).lower()
        assert "diplo" in lowered or "diplomatique" in lowered, (
            "Le rapport HTML doit mentionner le CER diplomatique"
        )

    def test_html_contains_medieval_graphie_indicator(self, tmp_path):
        """The report mentions the medieval spelling equivalences (ſ=s or u=v)."""
        markup = self._render_sample_report(tmp_path)
        # Either the explicit equivalences or the word "diplomatique" must appear.
        assert "ſ=s" in markup or "u=v" in markup or "diplomatique" in markup.lower()