# Dokumentassistent / tests/test_chunker.py
# Repository-viewer residue (not part of the module):
# XQ · Code cleaning · 9612292 · raw · history blame · 10.1 kB
"""Tests for text chunking strategies."""
from unittest.mock import MagicMock, patch
import pytest
from src.ingestion.chunker import (
BaseChunker,
FixedSizeChunker,
RecursiveChunker,
SemanticChunker,
_make_chunk_id,
create_chunker,
)
from src.models import ChunkStrategy, DocumentChunk
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
DANISH_TEXT = (
"Københavns Universitet har følgende regler for behandling af persondata. "
"Alle ansatte skal overholde retningslinjerne i henhold til GDPR. "
"Særlige bestemmelser gælder for håndtering af følsomme oplysninger. "
"Ændringer træder i kraft den 1. januar. "
"Spørgsmål kan rettes til databeskyttelsesrådgiveren på ældre@telefonlinje.dk."
)
DOC_ID = "doc-test-001"
META: dict[str, str | int] = {"source": "test.pdf", "page": 1}
# ---------------------------------------------------------------------------
# Helper – deterministic chunk ID
# ---------------------------------------------------------------------------
class TestMakeChunkId:
    """Checks for the ``_make_chunk_id`` helper."""

    def test_deterministic(self) -> None:
        """Two calls with the same arguments return equal IDs."""
        first = _make_chunk_id("doc1", 0)
        second = _make_chunk_id("doc1", 0)
        assert first == second

    def test_different_inputs(self) -> None:
        """Changing the index or the document ID changes the ID."""
        other_index = _make_chunk_id("doc1", 1)
        other_document = _make_chunk_id("doc2", 0)
        assert _make_chunk_id("doc1", 0) != other_index
        assert _make_chunk_id("doc1", 0) != other_document

    def test_length(self) -> None:
        """The returned ID has exactly 16 characters."""
        chunk_id = _make_chunk_id("x", 0)
        assert len(chunk_id) == 16
# ---------------------------------------------------------------------------
# Output format helpers (shared assertions)
# ---------------------------------------------------------------------------
def _assert_valid_chunks(
    chunks: list[DocumentChunk],
    expected_strategy: ChunkStrategy,
    document_id: str = DOC_ID,
) -> None:
    """Check the structure of a chunker's output.

    Args:
        chunks: Result of a ``chunk`` call; must be a non-empty list.
        expected_strategy: Strategy every chunk must report.
        document_id: Document ID every chunk must carry.

    Raises:
        AssertionError: If the list is empty or any chunk has the wrong type,
            document ID, ID length, empty text, strategy, or ``chunk_index``.
    """
    assert isinstance(chunks, list)
    assert len(chunks) > 0

    for position, item in enumerate(chunks):
        assert isinstance(item, DocumentChunk)
        assert item.document_id == document_id

        # Chunk IDs are expected to be 16-character strings.
        assert isinstance(item.chunk_id, str)
        assert len(item.chunk_id) == 16

        assert isinstance(item.text, str)
        assert len(item.text) > 0

        assert item.strategy == expected_strategy
        # The stored index must match the chunk's position in the list.
        assert item.metadata["chunk_index"] == position
# ---------------------------------------------------------------------------
# FixedSizeChunker
# ---------------------------------------------------------------------------
class TestFixedSizeChunker:
    """Behavioural tests for ``FixedSizeChunker``."""

    def test_output_format(self) -> None:
        """Chunks pass the shared structural checks with the FIXED_SIZE strategy."""
        chunker = FixedSizeChunker(chunk_size=100, chunk_overlap=20)
        chunks = chunker.chunk(DANISH_TEXT, DOC_ID, META)
        _assert_valid_chunks(chunks, ChunkStrategy.FIXED_SIZE)

    def test_chunk_size_respected(self) -> None:
        """No chunk is longer than the configured ``chunk_size``."""
        chunker = FixedSizeChunker(chunk_size=50, chunk_overlap=10)
        chunks = chunker.chunk(DANISH_TEXT, DOC_ID, META)
        # Guard against a vacuous pass: the loop below checks nothing if the
        # chunker returns an empty list for non-empty input.
        assert chunks
        for chunk in chunks:
            assert len(chunk.text) <= 50

    def test_overlap(self) -> None:
        """The second chunk starts with the last ``chunk_overlap`` characters of the first."""
        overlap = 20
        chunker = FixedSizeChunker(chunk_size=60, chunk_overlap=overlap)
        chunks = chunker.chunk(DANISH_TEXT, DOC_ID, META)
        # DANISH_TEXT is several times longer than chunk_size, so fewer than
        # two chunks means splitting is broken; fail loudly instead of
        # silently skipping the overlap check.
        assert len(chunks) >= 2
        tail = chunks[0].text[-overlap:]
        assert chunks[1].text.startswith(tail)

    def test_empty_text(self) -> None:
        """An empty string produces no chunks."""
        chunker = FixedSizeChunker(chunk_size=100, chunk_overlap=20)
        chunks = chunker.chunk("", DOC_ID, META)
        assert chunks == []

    def test_short_text(self) -> None:
        """Text shorter than ``chunk_size`` comes back as a single unchanged chunk."""
        chunker = FixedSizeChunker(chunk_size=500, chunk_overlap=50)
        chunks = chunker.chunk("Hej", DOC_ID, META)
        assert len(chunks) == 1
        assert chunks[0].text == "Hej"
        assert chunks[0].strategy == ChunkStrategy.FIXED_SIZE

    def test_danish_characters_preserved(self) -> None:
        """The letters æ, ø and å come through the chunker unchanged."""
        text = "æble, ørred, åben"
        chunker = FixedSizeChunker(chunk_size=500, chunk_overlap=0)
        chunks = chunker.chunk(text, DOC_ID, META)
        assert chunks[0].text == text

    def test_metadata_propagated(self) -> None:
        """Every chunk carries the ``source`` and ``page`` values from META."""
        chunker = FixedSizeChunker(chunk_size=500, chunk_overlap=0)
        chunks = chunker.chunk(DANISH_TEXT, DOC_ID, META)
        # Guard against a vacuous pass when no chunks are returned.
        assert chunks
        for chunk in chunks:
            assert chunk.metadata["source"] == "test.pdf"
            assert chunk.metadata["page"] == 1
# ---------------------------------------------------------------------------
# RecursiveChunker
# ---------------------------------------------------------------------------
class TestRecursiveChunker:
    """Behavioural tests for ``RecursiveChunker``."""

    def test_output_format(self) -> None:
        """Chunks pass the shared structural checks with the RECURSIVE strategy."""
        result = RecursiveChunker(chunk_size=100, chunk_overlap=20).chunk(
            DANISH_TEXT, DOC_ID, META
        )
        _assert_valid_chunks(result, ChunkStrategy.RECURSIVE)

    def test_empty_text(self) -> None:
        """An empty string produces no chunks."""
        result = RecursiveChunker(chunk_size=100, chunk_overlap=20).chunk(
            "", DOC_ID, META
        )
        assert result == []

    def test_short_text(self) -> None:
        """Text shorter than ``chunk_size`` comes back as a single unchanged chunk."""
        result = RecursiveChunker(chunk_size=500, chunk_overlap=50).chunk(
            "Hej", DOC_ID, META
        )
        assert len(result) == 1
        only_chunk = result[0]
        assert only_chunk.text == "Hej"
        assert only_chunk.strategy == ChunkStrategy.RECURSIVE

    def test_danish_characters_preserved(self) -> None:
        """A sentence containing æ, ø and å comes through unchanged."""
        sentence = "Håndtering af ældre dokumenter kræver særlig opmærksomhed fra ændringsledelsen."
        result = RecursiveChunker(chunk_size=500, chunk_overlap=0).chunk(
            sentence, DOC_ID, META
        )
        assert result[0].text == sentence

    def test_splits_long_text(self) -> None:
        """Text longer than ``chunk_size`` is split into more than one chunk."""
        result = RecursiveChunker(chunk_size=80, chunk_overlap=10).chunk(
            DANISH_TEXT, DOC_ID, META
        )
        assert len(result) > 1
# ---------------------------------------------------------------------------
# SemanticChunker (requires a mock embeddings instance)
# ---------------------------------------------------------------------------
class TestSemanticChunker:
    """Tests for ``SemanticChunker``.

    ``LCSemanticChunker`` in ``src.ingestion.chunker`` is patched in every
    test, and the embeddings argument is a ``MagicMock``.
    """

    @patch("src.ingestion.chunker.LCSemanticChunker")
    def test_output_format(self, mock_lc_chunker_cls: MagicMock) -> None:
        """Each document from the patched splitter becomes one valid chunk, in order."""
        expected_texts = ["Første del af teksten.", "Anden del af teksten."]
        splitter = mock_lc_chunker_cls.return_value
        splitter.create_documents.return_value = [
            MagicMock(page_content=content) for content in expected_texts
        ]

        chunker = SemanticChunker(chunk_size=100, chunk_overlap=20, embeddings=MagicMock())
        chunks = chunker.chunk(DANISH_TEXT, DOC_ID, META)

        _assert_valid_chunks(chunks, ChunkStrategy.SEMANTIC)
        assert len(chunks) == 2
        assert [chunk.text for chunk in chunks] == expected_texts

    @patch("src.ingestion.chunker.LCSemanticChunker")
    def test_empty_text(self, mock_lc_chunker_cls: MagicMock) -> None:
        """No chunks are returned when the patched splitter yields no documents."""
        splitter = mock_lc_chunker_cls.return_value
        splitter.create_documents.return_value = []

        chunker = SemanticChunker(chunk_size=100, chunk_overlap=20, embeddings=MagicMock())

        assert chunker.chunk("", DOC_ID, META) == []

    @patch("src.ingestion.chunker.LCSemanticChunker")
    def test_short_text(self, mock_lc_chunker_cls: MagicMock) -> None:
        """A single splitter document becomes a single SEMANTIC chunk."""
        splitter = mock_lc_chunker_cls.return_value
        splitter.create_documents.return_value = [MagicMock(page_content="Hej")]

        chunker = SemanticChunker(chunk_size=500, chunk_overlap=50, embeddings=MagicMock())
        chunks = chunker.chunk("Hej", DOC_ID, META)

        assert len(chunks) == 1
        only_chunk = chunks[0]
        assert only_chunk.text == "Hej"
        assert only_chunk.strategy == ChunkStrategy.SEMANTIC

    @patch("src.ingestion.chunker.LCSemanticChunker")
    def test_danish_characters_preserved(self, mock_lc_chunker_cls: MagicMock) -> None:
        """Text containing Æ, ø and å is carried into the chunk unchanged."""
        phrase = "Ændringsforslag vedrørende årsregnskabet"
        splitter = mock_lc_chunker_cls.return_value
        splitter.create_documents.return_value = [MagicMock(page_content=phrase)]

        chunker = SemanticChunker(chunk_size=500, chunk_overlap=0, embeddings=MagicMock())
        chunks = chunker.chunk(phrase, DOC_ID, META)

        assert chunks[0].text == phrase
# ---------------------------------------------------------------------------
# Factory: create_chunker
# ---------------------------------------------------------------------------
class TestCreateChunker:
    """Tests for the ``create_chunker`` factory."""

    def test_fixed_size(self) -> None:
        """FIXED_SIZE yields a ``FixedSizeChunker``."""
        created = create_chunker(ChunkStrategy.FIXED_SIZE, 100, 20)
        assert isinstance(created, FixedSizeChunker)

    def test_recursive(self) -> None:
        """RECURSIVE yields a ``RecursiveChunker``."""
        created = create_chunker(ChunkStrategy.RECURSIVE, 100, 20)
        assert isinstance(created, RecursiveChunker)

    def test_semantic(self) -> None:
        """SEMANTIC with an embeddings object yields a ``SemanticChunker``."""
        created = create_chunker(
            ChunkStrategy.SEMANTIC,
            100,
            20,
            embeddings=MagicMock(),
        )
        assert isinstance(created, SemanticChunker)

    def test_semantic_without_embeddings_raises(self) -> None:
        """SEMANTIC without embeddings raises ``ValueError``."""
        expected_message = "Embeddings instance is required"
        with pytest.raises(ValueError, match=expected_message):
            create_chunker(ChunkStrategy.SEMANTIC, 100, 20)

    def test_unknown_strategy_raises(self) -> None:
        """A value that is not a known strategy raises ``ValueError``."""
        expected_message = "Unknown chunking strategy"
        with pytest.raises(ValueError, match=expected_message):
            create_chunker("invalid", 100, 20)  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# BaseChunker – not implemented guard
# ---------------------------------------------------------------------------
class TestBaseChunker:
    """Guard test for the ``BaseChunker`` base class."""

    def test_chunk_raises_not_implemented(self) -> None:
        """Calling ``chunk`` on ``BaseChunker`` itself raises ``NotImplementedError``."""
        plain_base = BaseChunker(chunk_size=100, chunk_overlap=20)
        with pytest.raises(NotImplementedError):
            plain_base.chunk("text", DOC_ID, META)