Spaces:
Sleeping
Sleeping
| """ | |
| Tests for document chunking functionality. | |
| """ | |
| import pytest | |
| from src.chunking import SemanticChunker, DocumentChunk | |
@pytest.fixture
def chunker():
    """Provide a ``SemanticChunker`` instance for the tests in this module.

    Registered as a pytest fixture so that every test function declaring a
    ``chunker`` parameter receives the object built here. Without the
    decorator pytest cannot resolve that parameter and the tests error out
    during setup.

    Returns:
        SemanticChunker: constructed with ``chunk_size=200`` and
        ``chunk_overlap=50``.
    """
    return SemanticChunker(chunk_size=200, chunk_overlap=50)
def test_basic_chunking(chunker):
    """Check that a short multi-line document is split into valid chunks.

    The result must be non-empty, contain only ``DocumentChunk`` objects,
    and every chunk must carry truthy ``content``.
    """
    # Adjacent literals build the same value as the triple-quoted form:
    # a leading newline, one line per entry, and a trailing newline.
    sample = (
        "\n"
        "FastAPI is a modern, fast (high-performance) web framework.\n"
        "It is based on standard Python type hints.\n"
        "The key features are:\n"
        "- Fast: Very high performance\n"
        "- Fast to code: Increase development speed\n"
        "- Fewer bugs: Reduce human errors\n"
    )
    pieces = chunker.chunk_document(sample)

    assert len(pieces) > 0
    # Type check over every chunk first, then the content check, in the
    # same order as the original pair of ``all(...)`` assertions.
    for piece in pieces:
        assert isinstance(piece, DocumentChunk)
    for piece in pieces:
        assert piece.content
def test_chunk_metadata(chunker):
    """Check that caller-supplied metadata is attached to the first chunk.

    Every supplied key must come back unchanged on the first chunk, and the
    chunk's metadata must additionally contain a ``"chunk_index"`` entry.
    """
    expected = {
        "source": "test.md",
        "title": "Test Document",
        "url": "https://example.com",
    }
    # Hand the chunker its own dict so ``expected`` stays untouched whatever
    # the chunker does with the mapping it receives.
    produced = chunker.chunk_document("FastAPI is awesome.", metadata=dict(expected))

    assert len(produced) > 0
    first = produced[0]
    for key, value in expected.items():
        assert first.metadata[key] == value
    assert "chunk_index" in first.metadata
def test_code_block_preservation(chunker):
    """Check that a fenced code block survives chunking.

    The contents of all chunks, joined with single spaces, must still
    contain the opening fence and the ``FastAPI`` identifier.
    """
    # Same value as the triple-quoted literal: leading newline, one line per
    # entry, trailing newline.
    document = (
        "\n"
        "Here's an example:\n"
        "```python\n"
        "from fastapi import FastAPI\n"
        "app = FastAPI()\n"
        "```\n"
        "This creates an app.\n"
    )
    pieces = chunker.chunk_document(document)

    # The code block should still be visible in the recombined output.
    recombined = " ".join([piece.content for piece in pieces])
    for fragment in ("```python", "FastAPI"):
        assert fragment in recombined
def test_empty_text(chunker):
    """Check that empty and whitespace-only input produce no chunks."""
    # The empty string first, then a whitespace-only string; each must give
    # back exactly an empty list.
    for blank in ("", " "):
        result = chunker.chunk_document(blank)
        assert result == []
def test_to_dict(chunker):
    """Test ``DocumentChunk`` serialization through ``to_dict``.

    Chunks a short text with metadata, serializes the first chunk, and
    checks that the resulting dict has ``"content"``, ``"metadata"`` and
    ``"chunk_id"`` keys and that ``"content"`` equals the chunk's own
    ``content`` attribute.
    """
    text = "Test content"
    metadata = {"source": "test"}
    chunks = chunker.chunk_document(text, metadata=metadata)

    # Guard before indexing so an empty result is reported as a clear
    # assertion failure rather than an IndexError on ``chunks[0]``.
    assert len(chunks) > 0
    chunk = chunks[0]
    chunk_dict = chunk.to_dict()

    for key in ("content", "metadata", "chunk_id"):
        assert key in chunk_dict
    assert chunk_dict["content"] == chunk.content