Picarones / tests /domain /test_artifacts.py
Claude
test(rename): dΓ©-sprintage tests/domain (7 fichiers, git mv)
ade2cbf unverified
"""Sprint A14-S4 β€” ``Artifact`` et ``ArtifactType``.
VΓ©rifie les invariants des artefacts du nouveau domain : validation
des id, hash, immutabilitΓ©, sΓ©rialisation JSON dΓ©terministe.
Note : pas de test "logique mΓ©tier" ici β€” un Artifact ne fait rien,
il dΓ©crit. Les tests qui valident le comportement viendront avec
le pipeline executor (S7) qui produit et consomme des artefacts.
"""
from __future__ import annotations
import hashlib
import pytest
from pydantic import ValidationError
from picarones.domain import (
Artifact,
ArtifactType,
ArtifactValidationError,
ProvenanceRecord,
compute_content_hash,
)
def _prov() -> ProvenanceRecord:
return ProvenanceRecord(code_version="1.0.0", parameters_hash="a" * 64)
# ──────────────────────────────────────────────────────────────────────
# ArtifactType
# ──────────────────────────────────────────────────────────────────────
class TestArtifactType:
def test_canonical_values(self) -> None:
"""Sprint A14-S4 β€” valeurs canoniques (9 jusqu'au S49 ;
+``confidences`` ajoutΓ© au S50 pour le sidecar JSON OCR).
"""
expected = {
"image", "raw_text", "corrected_text",
"alto_xml", "page_xml", "canonical_document",
"entities", "reading_order", "alignment",
"confidences",
}
assert {t.value for t in ArtifactType} == expected
def test_string_enum_serializes_as_value(self) -> None:
"""``ArtifactType`` hΓ©rite de ``str`` β†’ JSON en string brute."""
assert ArtifactType.RAW_TEXT == "raw_text"
assert ArtifactType("alto_xml") is ArtifactType.ALTO_XML
# ──────────────────────────────────────────────────────────────────────
# compute_content_hash
# ──────────────────────────────────────────────────────────────────────
class TestComputeContentHash:
def test_returns_64_char_hex(self) -> None:
h = compute_content_hash(b"hello")
assert len(h) == 64
assert int(h, 16) >= 0 # hex valide
def test_deterministic(self) -> None:
assert compute_content_hash(b"abc") == compute_content_hash(b"abc")
def test_matches_sha256(self) -> None:
h = compute_content_hash(b"picarones")
assert h == hashlib.sha256(b"picarones").hexdigest()
# ──────────────────────────────────────────────────────────────────────
# Artifact β€” crΓ©ation et validation
# ──────────────────────────────────────────────────────────────────────
class TestArtifactCreation:
def test_minimal_artifact(self) -> None:
a = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
assert a.id == "x"
assert a.uri is None
assert a.content_hash is None
assert a.produced_by_step is None
assert a.provenance is None
def test_full_artifact(self) -> None:
a = Artifact(
id="d1:ocr:raw_text",
document_id="d1",
type=ArtifactType.RAW_TEXT,
uri="/tmp/x.txt",
content_hash="b" * 64,
produced_by_step="ocr",
provenance=_prov(),
)
assert a.produced_by_step == "ocr"
def test_id_validation_rejects_spaces(self) -> None:
with pytest.raises(ArtifactValidationError, match="id invalide"):
Artifact(id="bad id", document_id="d1", type=ArtifactType.RAW_TEXT)
def test_id_validation_rejects_null_byte(self) -> None:
with pytest.raises(ArtifactValidationError):
Artifact(id="x\x00y", document_id="d1", type=ArtifactType.RAW_TEXT)
def test_id_accepts_filesystem_safe_chars(self) -> None:
# alphanum + ``_.-:/`` selon le regex.
a = Artifact(
id="vol_a:folio.001-r/raw_text",
document_id="vol_a/folio.001-r",
type=ArtifactType.RAW_TEXT,
)
assert a.id == "vol_a:folio.001-r/raw_text"
def test_content_hash_must_be_64_hex(self) -> None:
# Trop court
with pytest.raises(ValidationError):
Artifact(
id="x", document_id="d1", type=ArtifactType.RAW_TEXT,
content_hash="abc",
)
# Bonne longueur mais pas hex
with pytest.raises(ArtifactValidationError, match="hex SHA-256"):
Artifact(
id="x", document_id="d1", type=ArtifactType.RAW_TEXT,
content_hash="z" * 64,
)
def test_content_hash_lowercased(self) -> None:
a = Artifact(
id="x", document_id="d1", type=ArtifactType.RAW_TEXT,
content_hash="A" * 64,
)
assert a.content_hash == "a" * 64
# ──────────────────────────────────────────────────────────────────────
# Artifact β€” immutabilitΓ©
# ──────────────────────────────────────────────────────────────────────
class TestArtifactImmutability:
def test_frozen_blocks_attribute_mutation(self) -> None:
a = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
with pytest.raises(ValidationError):
a.id = "y" # type: ignore[misc]
def test_extra_fields_rejected(self) -> None:
with pytest.raises(ValidationError):
Artifact( # type: ignore[call-arg]
id="x", document_id="d1", type=ArtifactType.RAW_TEXT,
bogus_field="oops",
)
# ──────────────────────────────────────────────────────────────────────
# Artifact β€” sΓ©rialisation dΓ©terministe
# ──────────────────────────────────────────────────────────────────────
class TestArtifactSerialization:
def test_json_roundtrip_preserves_equality(self) -> None:
a = Artifact(
id="d1:ocr:raw_text", document_id="d1",
type=ArtifactType.RAW_TEXT, content_hash="c" * 64,
produced_by_step="ocr", provenance=_prov(),
)
j = a.model_dump_json()
a2 = Artifact.model_validate_json(j)
assert a == a2
def test_json_is_byte_deterministic(self) -> None:
"""MΓͺme contenu β†’ mΓͺmes octets exacts. Indispensable au cache
d'artefacts du Sprint S7."""
a1 = Artifact(
id="x", document_id="d1", type=ArtifactType.RAW_TEXT,
content_hash="d" * 64,
)
a2 = Artifact(
id="x", document_id="d1", type=ArtifactType.RAW_TEXT,
content_hash="d" * 64,
)
assert a1.model_dump_json() == a2.model_dump_json()
def test_artifacts_are_hashable(self) -> None:
"""Frozen pydantic models sont hashables β€” on peut les mettre
dans un set ou utiliser comme clΓ© de dict."""
a = Artifact(id="x", document_id="d1", type=ArtifactType.RAW_TEXT)
s = {a}
assert a in s