# DeveloperDocs_RAG / test_chunking.py
# Aishwarya30998's picture
# Deploy DeveloperDocs-AI-Copilot-RAG to Hugging Face Space
# 14f13a5
"""
Tests for document chunking functionality.
"""
import pytest
from src.chunking import SemanticChunker, DocumentChunk
@pytest.fixture
def chunker():
    """Provide a SemanticChunker built with chunk_size=200 and chunk_overlap=50."""
    settings = {"chunk_size": 200, "chunk_overlap": 50}
    return SemanticChunker(**settings)
def test_basic_chunking(chunker):
    """Check that a short multi-line document produces usable chunks.

    The result must be non-empty, every item must be a DocumentChunk, and
    every chunk must carry truthy content.
    """
    # The literal's lines stay flush-left so the text handed to the chunker
    # is unchanged.
    sample = """
FastAPI is a modern, fast (high-performance) web framework.
It is based on standard Python type hints.
The key features are:
- Fast: Very high performance
- Fast to code: Increase development speed
- Fewer bugs: Reduce human errors
"""
    produced = chunker.chunk_document(sample)

    assert len(produced) > 0
    # Type check every chunk first, then content, mirroring the order of the
    # two separate checks.
    for piece in produced:
        assert isinstance(piece, DocumentChunk)
    for piece in produced:
        assert piece.content
def test_chunk_metadata(chunker):
    """Verify caller-supplied metadata is carried onto the first chunk.

    Also checks that the chunk's metadata contains a "chunk_index" key.
    """
    supplied = {
        "source": "test.md",
        "title": "Test Document",
        "url": "https://example.com",
    }
    results = chunker.chunk_document("FastAPI is awesome.", metadata=supplied)
    assert len(results) > 0

    first = results[0]
    # Compare against literal expectations rather than `supplied`, so the
    # check does not depend on the dict that was handed to the chunker.
    expectations = (
        ("source", "test.md"),
        ("title", "Test Document"),
        ("url", "https://example.com"),
    )
    for key, wanted in expectations:
        assert first.metadata[key] == wanted
    assert "chunk_index" in first.metadata
def test_code_block_preservation(chunker):
    """Check that the code fence marker and code text appear in the chunked output."""
    # The literal's lines stay flush-left so the text handed to the chunker
    # is unchanged.
    document = """
Here's an example:
```python
from fastapi import FastAPI
app = FastAPI()
```
This creates an app.
"""
    pieces = chunker.chunk_document(document)

    # Search across all chunks at once: the fence marker and the code text
    # only need to appear somewhere in the combined output.
    merged = " ".join([piece.content for piece in pieces])
    for fragment in ("```python", "FastAPI"):
        assert fragment in merged
def test_empty_text(chunker):
    """Empty and single-space inputs must both chunk to an empty list."""
    for blank in ("", " "):
        assert chunker.chunk_document(blank) == []
def test_to_dict(chunker):
    """Test DocumentChunk serialization via ``to_dict``.

    The dictionary must expose "content", "metadata" and "chunk_id" keys, and
    its "content" value must equal the chunk's own content.
    """
    text = "Test content"
    metadata = {"source": "test"}
    chunks = chunker.chunk_document(text, metadata=metadata)
    # Guard before indexing so an empty result fails as a clear assertion
    # rather than an IndexError, matching test_chunk_metadata.
    assert len(chunks) > 0
    chunk = chunks[0]
    chunk_dict = chunk.to_dict()
    for key in ("content", "metadata", "chunk_id"):
        assert key in chunk_dict
    assert chunk_dict["content"] == chunk.content