PGC-AI-Chatbot / tests /test_knowledge_chunking.py
Jacooo's picture
Deploy from GitHub: ef6915c
0fe58ad verified
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.knowledge_chunking import build_normalized_child_chunks, NormalizedChildChunk
def test_build_normalized_child_chunks_filters_short_chunks():
    """Expect the 5-character chunk to be skipped and the 120-character one kept."""
    doc_meta = {"source_file": "Doc.pdf", "page_number": 1}
    long_text = "A" * 120
    raw_chunks = [
        # Each raw chunk gets its own metadata dict, as in a real extraction run.
        {"text": "short", "metadata": dict(doc_meta)},
        {"text": long_text, "metadata": dict(doc_meta)},
    ]

    children, summary = build_normalized_child_chunks(raw_chunks)

    assert len(children) == 1
    survivor = children[0]
    assert survivor.filename == "Doc.pdf"
    assert survivor.page_number == 1
    assert survivor.content == long_text
    assert summary.skipped == 1
    assert summary.total_final == 1
def test_build_normalized_child_chunks_assigns_stable_child_ids_in_source_order():
    """Expect child ids of the form ``<file>::p<page>::i<index>`` in input order."""
    paragraphs = ("First paragraph ", "Second paragraph ")
    raw_chunks = [
        {
            "text": paragraph * 10,
            "metadata": {"source_file": "Doc.pdf", "page_number": page},
        }
        for page, paragraph in enumerate(paragraphs, start=1)
    ]

    children, _stats = build_normalized_child_chunks(raw_chunks)

    observed_ids = [child.child_id for child in children]
    assert observed_ids == ["Doc.pdf::p1::i0", "Doc.pdf::p2::i1"]
def test_build_normalized_child_chunks_splits_oversized_chunk():
    """Expect one large raw chunk to yield several children that keep its metadata."""
    # Two 1800-character runs joined by ". " (3602 characters in total); the
    # test expects this single raw chunk to come back as at least two children.
    oversized_text = ". ".join(("B" * 1800, "C" * 1800))
    raw_chunks = [
        {
            "text": oversized_text,
            "metadata": {"source_file": "Big.pdf", "page_number": 3},
        },
    ]

    children, summary = build_normalized_child_chunks(raw_chunks)

    assert len(children) >= 2
    assert summary.split_count == 1
    # Every child must still point back to the originating file and page.
    for child in children:
        assert child.filename == "Big.pdf"
        assert child.page_number == 3
def test_build_normalized_child_chunks_source_label():
    """Expect the ``source`` label derived from the file name to be human-readable."""
    pdf_name = "Cornell-CEA-Lettuce-Handbook-.pdf"
    raw_chunks = [
        {
            "text": "A" * 120,
            "metadata": {"source_file": pdf_name, "page_number": 1},
        },
    ]

    children, _stats = build_normalized_child_chunks(raw_chunks)

    label = children[0].source
    assert label == "Cornell Cea Lettuce Handbook"
def test_normalized_child_chunk_ordinal_is_global_across_sources():
    """Expect the corpus ordinal to keep counting across files, not restart per file."""
    raw_chunks = [
        {
            "text": letter * 120,
            "metadata": {"source_file": file_name, "page_number": 1},
        }
        for letter, file_name in (("A", "Doc1.pdf"), ("B", "Doc2.pdf"))
    ]

    children, _stats = build_normalized_child_chunks(raw_chunks)

    first = children[0]
    assert first.corpus_ordinal == 0
    second = children[1]
    assert second.corpus_ordinal == 1
    # The trailing index in the id continues from the first file into the second.
    assert first.child_id == "Doc1.pdf::p1::i0"
    assert second.child_id == "Doc2.pdf::p1::i1"
def test_build_normalized_child_chunks_same_page_chunks_get_unique_ids():
    """Expect two chunks from the same file and page to receive distinct child ids."""
    shared_meta = {"source_file": "Doc.pdf", "page_number": 5}
    raw_chunks = [
        {"text": fill * 120, "metadata": dict(shared_meta)} for fill in "AB"
    ]

    children, _stats = build_normalized_child_chunks(raw_chunks)

    # The trailing index differs even though file name and page are identical.
    assert children[0].child_id == "Doc.pdf::p5::i0"
    assert children[1].child_id == "Doc.pdf::p5::i1"
    distinct_ids = {child.child_id for child in children}
    assert len(distinct_ids) == 2