# document-extract-agent/tests/test_detect.py
# kennethzychew -- phase 2.1: modality detection (commit e9e88a5)
# File size: 3.68 kB
"""Unit tests for modality detection.
Covers the acceptance criteria for build-plan task 2.1: every supported
extension is classified correctly as ``native_pdf`` or ``image``. Also exercises
case-insensitivity, ``str``/``Path`` inputs, paths with directories, the MIME
fallback, the unsupported-type error, the ``is_supported`` predicate, and the
purity guarantee (no filesystem access).
"""
from __future__ import annotations
from pathlib import Path
import pytest
from doc_agent.parsing.detect import (
IMAGE_EXTENSIONS,
NATIVE_PDF_EXTENSIONS,
UnsupportedModalityError,
detect_modality,
is_supported,
)
@pytest.mark.parametrize("ext", sorted(NATIVE_PDF_EXTENSIONS))
def test_native_pdf_extensions(ext: str) -> None:
    """Each entry of ``NATIVE_PDF_EXTENSIONS`` must be detected as ``native_pdf``."""
    # Build a bare filename from the parametrized extension.
    filename = f"invoice{ext}"
    modality = detect_modality(filename)
    assert modality == "native_pdf"
@pytest.mark.parametrize("ext", sorted(IMAGE_EXTENSIONS))
def test_image_extensions(ext: str) -> None:
    """Each entry of ``IMAGE_EXTENSIONS`` must be detected as ``image``."""
    # Build a bare filename from the parametrized extension.
    filename = f"receipt{ext}"
    modality = detect_modality(filename)
    assert modality == "image"
@pytest.mark.parametrize(
    ("name", "expected"),
    [
        ("scan.PDF", "native_pdf"),
        ("photo.JPG", "image"),
        ("photo.JpEg", "image"),
        ("page.TIFF", "image"),
    ],
)
def test_extension_match_is_case_insensitive(name: str, expected: str) -> None:
    """Upper-case and mixed-case extensions classify like their lower-case forms."""
    detected = detect_modality(name)
    assert detected == expected
def test_accepts_path_object() -> None:
    """``detect_modality`` accepts a ``pathlib.Path`` as well as a plain string."""
    as_path = Path("docs/invoice.pdf")
    modality = detect_modality(as_path)
    assert modality == "native_pdf"
def test_full_path_with_directories() -> None:
    """Leading directories (POSIX- or Windows-style) do not affect the result."""
    # Ordered (path, expected modality) pairs; checked in sequence.
    cases = (
        ("/data/inbox/2024/q1/receipt.png", "image"),
        (r"C:\data\inbox\bill.pdf", "native_pdf"),
    )
    for location, wanted in cases:
        assert detect_modality(location) == wanted
def test_filename_with_multiple_dots() -> None:
    """Names containing several dots are classified by their last suffix."""
    # Ordered (filename, expected modality) pairs; checked in sequence.
    cases = (
        ("vendor.invoice.final.pdf", "native_pdf"),
        ("scan.v2.tiff", "image"),
    )
    for dotted_name, wanted in cases:
        assert detect_modality(dotted_name) == wanted
def test_mime_fallback_for_unenumerated_extension() -> None:
    """An extension missing from the explicit sets is still classified.

    ``.jfif`` is used as the probe: the test first confirms it is not listed
    in ``IMAGE_EXTENSIONS`` and then expects ``detect_modality`` to report
    ``image`` for it anyway, which exercises the MIME-guess fallback.

    NOTE(review): this presumes the MIME lookup used by ``detect_modality``
    resolves ``.jfif`` to an image type on every platform the suite runs
    on -- confirm against the implementation.
    """
    probe_suffix = ".jfif"
    # Precondition: the probe must not be covered by the explicit set.
    assert probe_suffix not in IMAGE_EXTENSIONS

    modality = detect_modality("photo.jfif")
    assert modality == "image"
@pytest.mark.parametrize("name", ["notes.txt", "report.docx", "archive.zip", "noext"])
def test_unsupported_type_raises(name: str) -> None:
    """Unsupported or suffix-less names raise ``UnsupportedModalityError``.

    The error text must contain the rejected name and mention ``pdf``
    (checked case-insensitively).
    """
    with pytest.raises(UnsupportedModalityError) as raised:
        detect_modality(name)

    # Inspect the message only after the context manager has captured the error.
    error_text = str(raised.value)
    assert name in error_text
    assert "pdf" in error_text.lower()
def test_is_supported_predicate() -> None:
    """``is_supported`` returns a strict boolean for supported and unsupported names."""
    # Insertion order is preserved, so the checks run in the listed sequence.
    verdicts = {
        "invoice.pdf": True,
        "receipt.png": True,
        "notes.txt": False,
        "noext": False,
    }
    for filename, verdict in verdicts.items():
        # Identity comparison: the result must be exactly True/False, not merely truthy/falsy.
        assert is_supported(filename) is verdict
def test_detection_does_not_touch_the_filesystem() -> None:
    """A path that does not exist on disk is still classified from its name."""
    phantom = Path("this/path/does/not/exist") / "phantom.pdf"
    # Sanity check: the fixture path really is absent.
    assert not phantom.exists()

    modality = detect_modality(phantom)
    assert modality == "native_pdf"