Spaces:
Running
Running
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from app.knowledge_chunking import build_normalized_child_chunks, NormalizedChildChunk | |
def test_build_normalized_child_chunks_filters_short_chunks():
    """The 5-character chunk is dropped and counted; the 120-character one is kept."""
    shared_meta = {"source_file": "Doc.pdf", "page_number": 1}
    long_text = "A" * 120
    # Each raw chunk gets its own metadata dict, as in a real extraction run.
    raw_chunks = [
        {"text": "short", "metadata": dict(shared_meta)},
        {"text": long_text, "metadata": dict(shared_meta)},
    ]

    kept, summary = build_normalized_child_chunks(raw_chunks)

    assert len(kept) == 1
    survivor = kept[0]
    assert survivor.filename == "Doc.pdf"
    assert survivor.page_number == 1
    assert survivor.content == long_text
    # The stats object must account for the discarded chunk.
    assert summary.skipped == 1
    assert summary.total_final == 1
def test_build_normalized_child_chunks_assigns_stable_child_ids_in_source_order():
    """Two chunks on pages 1 and 2 of one file get ids ending in i0 and i1, in input order."""
    paragraph_seeds = ("First paragraph ", "Second paragraph ")
    raw_chunks = [
        {
            "text": seed * 10,
            "metadata": {"source_file": "Doc.pdf", "page_number": page},
        }
        for page, seed in enumerate(paragraph_seeds, start=1)
    ]

    produced, _ = build_normalized_child_chunks(raw_chunks)

    observed_ids = [child.child_id for child in produced]
    assert observed_ids == ["Doc.pdf::p1::i0", "Doc.pdf::p2::i1"]
def test_build_normalized_child_chunks_splits_oversized_chunk():
    """One oversized raw chunk yields at least two children that keep its file and page."""
    # Two 1800-character runs joined by ". " (3602 characters in total).
    # Per the original author's note this is about 2x MAX_CHUNK_CHARS; the
    # constant itself is defined in the module under test, not here.
    oversized_text = ". ".join(("B" * 1800, "C" * 1800))
    raw_chunks = [
        {
            "text": oversized_text,
            "metadata": {"source_file": "Big.pdf", "page_number": 3},
        },
    ]

    children, summary = build_normalized_child_chunks(raw_chunks)

    assert len(children) >= 2
    assert summary.split_count == 1
    # Every child produced by the split must inherit the parent's provenance.
    for child in children:
        assert child.filename == "Big.pdf"
        assert child.page_number == 3
def test_build_normalized_child_chunks_source_label():
    """Check the ``source`` label produced for a hyphenated PDF file name.

    For ``Cornell-CEA-Lettuce-Handbook-.pdf`` the expected label is
    ``Cornell Cea Lettuce Handbook``.
    """
    raw_chunks = [
        {
            "text": "A" * 120,
            "metadata": {
                "source_file": "Cornell-CEA-Lettuce-Handbook-.pdf",
                "page_number": 1,
            },
        },
    ]

    # The stats half of the result is irrelevant to this test.
    chunks, _ = build_normalized_child_chunks(raw_chunks)

    # Guard the index below so an empty result fails as a readable assertion
    # instead of an IndexError.
    assert len(chunks) == 1
    assert chunks[0].source == "Cornell Cea Lettuce Handbook"
def test_normalized_child_chunk_ordinal_is_global_across_sources():
    """Ordinal is global across all sources (not reset per file).

    Two chunks from different files must receive ordinals 0 and 1, and the
    ``i<N>`` suffix of each ``child_id`` must match that ordinal.
    """
    raw_chunks = [
        {"text": "A" * 120, "metadata": {"source_file": "Doc1.pdf", "page_number": 1}},
        {"text": "B" * 120, "metadata": {"source_file": "Doc2.pdf", "page_number": 1}},
    ]

    # The stats half of the result is irrelevant to this test.
    chunks, _ = build_normalized_child_chunks(raw_chunks)

    # Guard the indexing below so a missing chunk fails as a readable
    # assertion instead of an IndexError.
    assert len(chunks) == 2
    assert chunks[0].corpus_ordinal == 0
    assert chunks[1].corpus_ordinal == 1
    assert chunks[0].child_id == "Doc1.pdf::p1::i0"
    assert chunks[1].child_id == "Doc2.pdf::p1::i1"
def test_build_normalized_child_chunks_same_page_chunks_get_unique_ids():
    """Two chunks from the same file and page still receive distinct child ids."""
    page_meta = {"source_file": "Doc.pdf", "page_number": 5}
    raw_chunks = [
        {"text": fill * 120, "metadata": dict(page_meta)}
        for fill in ("A", "B")
    ]

    produced, _ = build_normalized_child_chunks(raw_chunks)

    # The ordinal suffix disambiguates even when source and page are identical.
    expected_ids = ("Doc.pdf::p5::i0", "Doc.pdf::p5::i1")
    for position, expected_id in enumerate(expected_ids):
        assert produced[position].child_id == expected_id
    assert len({child.child_id for child in produced}) == 2