"""Tests for ``build_normalized_child_chunks`` from ``app.knowledge_chunking``."""

import sys
from pathlib import Path

# Put the directory two levels above this file on sys.path so that the
# ``app`` package can be imported below.
sys.path.insert(0, str(Path(__file__).parent.parent))

from app.knowledge_chunking import (  # noqa: E402  (must follow the sys.path tweak)
    NormalizedChildChunk,
    build_normalized_child_chunks,
)


def _raw_chunk(text, source_file, page_number):
    """Build one raw chunk dict in the shape these tests feed to the builder."""
    return {
        "text": text,
        "metadata": {"source_file": source_file, "page_number": page_number},
    }


def test_build_normalized_child_chunks_filters_short_chunks():
    """A 5-character chunk is dropped; the 120-character chunk is kept as-is."""
    raw_chunks = [
        _raw_chunk("short", "Doc.pdf", 1),
        _raw_chunk("A" * 120, "Doc.pdf", 1),
    ]

    chunks, stats = build_normalized_child_chunks(raw_chunks)

    assert len(chunks) == 1
    kept = chunks[0]
    assert kept.filename == "Doc.pdf"
    assert kept.page_number == 1
    assert kept.content == "A" * 120
    assert stats.skipped == 1
    assert stats.total_final == 1


def test_build_normalized_child_chunks_assigns_stable_child_ids_in_source_order():
    """Child ids follow input order and encode file, page and running index."""
    raw_chunks = [
        _raw_chunk("First paragraph " * 10, "Doc.pdf", 1),
        _raw_chunk("Second paragraph " * 10, "Doc.pdf", 2),
    ]

    chunks, _ = build_normalized_child_chunks(raw_chunks)

    assert [c.child_id for c in chunks] == [
        "Doc.pdf::p1::i0",
        "Doc.pdf::p2::i1",
    ]


def test_build_normalized_child_chunks_splits_oversized_chunk():
    """One oversized raw chunk yields several children sharing file and page."""
    # A chunk that is 2x MAX_CHUNK_CHARS should become (at least) 2 children.
    # NOTE(review): MAX_CHUNK_CHARS is defined outside this file; the sizes
    # below assume it is well under 3600 characters -- confirm.
    raw_chunks = [
        _raw_chunk("B" * 1800 + ". \n" + "C" * 1800, "Big.pdf", 3),
    ]

    chunks, stats = build_normalized_child_chunks(raw_chunks)

    assert len(chunks) >= 2
    assert stats.split_count == 1
    for c in chunks:
        assert c.filename == "Big.pdf"
        assert c.page_number == 3


def test_build_normalized_child_chunks_source_label():
    """The ``source`` label is the title-cased file name without dashes or suffix."""
    raw_chunks = [
        _raw_chunk("A" * 120, "Cornell-CEA-Lettuce-Handbook-.pdf", 1),
    ]

    chunks, _ = build_normalized_child_chunks(raw_chunks)

    assert chunks[0].source == "Cornell Cea Lettuce Handbook"


def test_normalized_child_chunk_ordinal_is_global_across_sources():
    """Ordinal is global across all sources (not reset per file)."""
    raw_chunks = [
        _raw_chunk("A" * 120, "Doc1.pdf", 1),
        _raw_chunk("B" * 120, "Doc2.pdf", 1),
    ]

    chunks, _ = build_normalized_child_chunks(raw_chunks)

    first, second = chunks[0], chunks[1]
    assert first.corpus_ordinal == 0
    assert second.corpus_ordinal == 1
    assert first.child_id == "Doc1.pdf::p1::i0"
    assert second.child_id == "Doc2.pdf::p1::i1"


def test_build_normalized_child_chunks_same_page_chunks_get_unique_ids():
    """Two chunks from the same file and page still receive distinct ids."""
    raw_chunks = [
        _raw_chunk("A" * 120, "Doc.pdf", 5),
        _raw_chunk("B" * 120, "Doc.pdf", 5),
    ]

    chunks, _ = build_normalized_child_chunks(raw_chunks)

    # The ordinal disambiguates even when source + page are identical.
    assert chunks[0].child_id == "Doc.pdf::p5::i0"
    assert chunks[1].child_id == "Doc.pdf::p5::i1"
    assert len({c.child_id for c in chunks}) == 2