import pytest
from langchain_core.documents import Document

from ingestion.chunker import chunker


@pytest.mark.unit
class TestChunker:
    """Unit tests for ``chunker.split_documents``."""

    def test_split_documents(self, sample_document_text):
        """Check that a long document is split into size-bounded chunks.

        ``sample_document_text`` is a pytest fixture defined outside this
        file; it is repeated ten times to build the input document.
        """
        docs = [Document(page_content=sample_document_text * 10)]

        chunks = chunker.split_documents(docs)

        assert len(chunks) > 0
        assert all(isinstance(chunk, Document) for chunk in chunks)
        # NOTE(review): 512 + 50 presumably mirrors the chunker's configured
        # chunk size plus some slack -- confirm against ingestion.chunker.
        assert all(len(chunk.page_content) <= 512 + 50 for chunk in chunks)

    def test_chunk_metadata_preserved(self):
        """Check that every chunk carries the source document's metadata."""
        doc = Document(
            page_content="This is a test document. " * 100,
            metadata={"source": "test.pdf", "page": 1},
        )

        chunks = chunker.split_documents([doc])

        # all() over an empty iterable is True, so without this guard the
        # metadata assertions below would pass even if no chunks came back.
        assert len(chunks) > 0
        assert all(chunk.metadata.get("source") == "test.pdf" for chunk in chunks)
        assert all(chunk.metadata.get("page") == 1 for chunk in chunks)