Spaces:
Sleeping
Sleeping
File size: 1,523 Bytes
a34068e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | from app.core.chunker import chunk_text
def test_empty_text():
    """An empty string and a single-space string must both yield no chunks."""
    for blank_input in ("", " "):
        assert chunk_text(blank_input) == []
def test_single_sentence():
    """One short sentence with chunk_size=100 comes back as a single chunk.

    The lone chunk must carry the input text unchanged and have index 0.
    """
    sentence = "This is a single sentence."
    result = chunk_text(sentence, chunk_size=100)

    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk["text"] == sentence
    assert only_chunk["chunk_index"] == 0
def test_multiple_chunks():
    """Five sentences with chunk_size=5, chunk_overlap=2 split into several chunks.

    Every chunk must be numbered by its position in the result, contain
    non-empty text, and report a non-negative start offset that lies
    strictly before its end offset.
    """
    passage = (
        "First sentence here. Second sentence here. "
        "Third sentence here. Fourth sentence here. "
        "Fifth sentence here."
    )
    pieces = chunk_text(passage, chunk_size=5, chunk_overlap=2)

    assert len(pieces) > 1
    for position, piece in enumerate(pieces):
        assert piece["chunk_index"] == position
        assert piece["text"]
        start = piece["start_char"]
        assert start >= 0
        assert piece["end_char"] > start
def test_overlap_present():
    """Consecutive chunks share at least one word when chunk_overlap=2.

    Compares the last two words of the first chunk with the first two
    words of the second chunk. If the chunker returns a single chunk
    (or none), there is nothing to compare and the test passes.
    """
    passage = "Alpha bravo charlie delta. Echo foxtrot golf hotel. India juliet kilo lima."
    pieces = chunk_text(passage, chunk_size=4, chunk_overlap=2)

    # Nothing to compare unless there are at least two chunks.
    if len(pieces) <= 1:
        return

    tail_of_first = set(pieces[0]["text"].split()[-2:])
    head_of_second = set(pieces[1]["text"].split()[:2])
    assert not tail_of_first.isdisjoint(head_of_second)
def test_chunk_size_respected():
    """No chunk except the last exceeds 25 whitespace-separated words.

    The input is 100 repetitions of "word" joined by spaces and ended
    with a period, chunked with chunk_size=20 and chunk_overlap=5.
    """
    # chunk_size is 20; allow some slack for sentence boundaries.
    max_words = 25

    passage = " ".join("word" for _ in range(100)) + "."
    pieces = chunk_text(passage, chunk_size=20, chunk_overlap=5)

    # The last chunk can be smaller, so it is left out of the check.
    all_but_last = pieces[:-1]
    for piece in all_but_last:
        words = piece["text"].split()
        assert len(words) <= max_words
|