File size: 2,514 Bytes
14f13a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Tests for document chunking functionality.
"""
import pytest
from src.chunking import SemanticChunker, DocumentChunk


@pytest.fixture
def chunker():
    """Provide a SemanticChunker built with chunk_size=200 and chunk_overlap=50."""
    settings = {"chunk_size": 200, "chunk_overlap": 50}
    return SemanticChunker(**settings)


def test_basic_chunking(chunker):
    """Chunk a short multi-paragraph text and sanity-check the result.

    The result must be non-empty, every element must be a DocumentChunk,
    and every chunk must carry truthy content.
    """
    # Built from explicit pieces so the indentation and whitespace-only
    # lines of the sample are visible in the source.
    sample = (
        "\n"
        "    FastAPI is a modern, fast (high-performance) web framework.\n"
        "    \n"
        "    It is based on standard Python type hints.\n"
        "    \n"
        "    The key features are:\n"
        "    - Fast: Very high performance\n"
        "    - Fast to code: Increase development speed\n"
        "    - Fewer bugs: Reduce human errors\n"
        "    "
    )

    pieces = chunker.chunk_document(sample)

    assert len(pieces) >= 1
    # First pass: type check on every chunk.
    for piece in pieces:
        assert isinstance(piece, DocumentChunk)
    # Second pass: no chunk may have empty content.
    for piece in pieces:
        assert piece.content


def test_chunk_metadata(chunker):
    """Verify that caller-supplied metadata shows up on the first chunk.

    Also checks that the chunk's metadata contains a "chunk_index" key.
    """
    source, title, url = "test.md", "Test Document", "https://example.com"
    doc_info = {
        "source": source,
        "title": title,
        "url": url,
    }

    pieces = chunker.chunk_document("FastAPI is awesome.", metadata=doc_info)

    assert len(pieces) >= 1
    first = pieces[0]

    # Each supplied field must come back unchanged.
    assert first.metadata["source"] == source
    assert first.metadata["title"] == title
    assert first.metadata["url"] == url
    assert "chunk_index" in first.metadata


def test_code_block_preservation(chunker):
    """Check that a fenced code block survives chunking.

    The contents of all chunks, joined with single spaces, must still
    contain the opening fence and the "FastAPI" identifier.
    """
    # Built from explicit pieces so the indentation and whitespace-only
    # lines of the sample are visible in the source.
    sample = (
        "\n"
        "    Here's an example:\n"
        "    \n"
        "    ```python\n"
        "    from fastapi import FastAPI\n"
        "    app = FastAPI()\n"
        "    ```\n"
        "    \n"
        "    This creates an app.\n"
        "    "
    )

    pieces = chunker.chunk_document(sample)

    # Look at the chunks as one string so the check does not depend on
    # where the chunk boundaries fall.
    contents = [piece.content for piece in pieces]
    joined = " ".join(contents)
    for needle in ("```python", "FastAPI"):
        assert needle in joined


def test_empty_text(chunker):
    """Check that empty and whitespace-only input yield an empty list."""
    for blank_input in ("", "   "):
        result = chunker.chunk_document(blank_input)
        assert result == []


def test_to_dict(chunker):
    """Test DocumentChunk serialization.

    Chunks a short text with a small metadata dict, then checks that
    ``to_dict()`` on the first chunk exposes the ``content``, ``metadata``
    and ``chunk_id`` keys and that ``content`` matches the chunk's own
    ``content`` attribute.
    """
    text = "Test content"
    metadata = {"source": "test"}

    chunks = chunker.chunk_document(text, metadata=metadata)
    # Guard the index below so an empty result fails as an assertion
    # (as test_chunk_metadata does) rather than as an IndexError.
    assert len(chunks) > 0
    chunk = chunks[0]

    chunk_dict = chunk.to_dict()

    assert "content" in chunk_dict
    assert "metadata" in chunk_dict
    assert "chunk_id" in chunk_dict
    assert chunk_dict["content"] == chunk.content