Spaces:
Sleeping
Sleeping
| from app.core.chunker import chunk_text | |
def test_empty_text():
    """Check that empty and blank input both produce an empty chunk list."""
    for blank in ("", " "):
        assert chunk_text(blank) == []
def test_single_sentence():
    """Check that one short sentence comes back as a single chunk at index 0."""
    sentence = "This is a single sentence."
    result = chunk_text(sentence, chunk_size=100)
    assert len(result) == 1
    only_chunk = result[0]
    # The chunk text must be the input sentence, unchanged.
    assert only_chunk["text"] == sentence
    assert only_chunk["chunk_index"] == 0
def test_multiple_chunks():
    """Check that a five-sentence text splits into several well-formed chunks.

    Every chunk must carry its position as ``chunk_index``, a non-empty
    ``text``, a non-negative ``start_char`` and an ``end_char`` strictly
    greater than ``start_char``.
    """
    text = (
        "First sentence here. Second sentence here. "
        "Third sentence here. Fourth sentence here. "
        "Fifth sentence here."
    )
    pieces = chunk_text(text, chunk_size=5, chunk_overlap=2)
    assert len(pieces) > 1
    for position, piece in enumerate(pieces):
        assert piece["chunk_index"] == position
        assert piece["text"]
        start = piece["start_char"]
        assert start >= 0
        assert piece["end_char"] > start
def test_overlap_present():
    """Check that the first two chunks share a word when chunk_overlap=2.

    The last two whitespace-separated words of the first chunk are compared
    with the first two words of the second chunk; at least one word must
    appear in both.
    """
    text = "Alpha bravo charlie delta. Echo foxtrot golf hotel. India juliet kilo lima."
    chunks = chunk_text(text, chunk_size=4, chunk_overlap=2)
    # Require more than one chunk rather than guarding with ``if``: with a
    # conditional guard the overlap check would be skipped and the test would
    # pass without verifying anything.
    # NOTE(review): assumes chunk_size=4 splits this 12-word text - confirm
    # against the chunker's unit for chunk_size.
    assert len(chunks) > 1, "expected more than one chunk to compare for overlap"
    first_words = chunks[0]["text"].split()
    second_words = chunks[1]["text"].split()
    overlap = set(first_words[-2:]) & set(second_words[:2])
    assert len(overlap) > 0
def test_chunk_size_respected():
    """Check that chunks of a 100-word text stay within 25 words each.

    The text is requested with chunk_size=20; the limit of 25 leaves slack
    for sentence boundaries. The final chunk is excluded from the check.
    """
    max_words = 25  # Allow some slack for sentence boundaries
    text = " ".join("word" for _ in range(100)) + "."
    pieces = chunk_text(text, chunk_size=20, chunk_overlap=5)
    # Last chunk can be smaller, so it is left out of the loop.
    for piece in pieces[:-1]:
        assert len(piece["text"].split()) <= max_words