Spaces:
Sleeping
Sleeping
File size: 1,569 Bytes
b0ef2dc 280d562 b0ef2dc 280d562 b0ef2dc 280d562 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
def test_chunk_by_concept_groups_correctly() -> None:
    """Check that ``chunk_by_concept`` groups raw items and merges their text.

    Three raw items are fed in: two share the same
    ``(source_document, concept)`` pair ("doc_A", "Mentoring") and one
    belongs to a different pair ("doc_B", "Tutoring"). The test expects
    exactly two chunks back and inspects the "Mentoring" chunk to confirm
    that its source document, page summary, concatenated content, and
    embedding text are what the assertions below spell out.
    """
    # Imported inside the test so that an import failure is reported against
    # this test rather than breaking collection of the whole module.
    from src.fot_recommender.semantic_chunker import chunk_by_concept

    # 1. Arrange: simple, predictable raw data. The two "Mentoring" items are
    #    deliberately separated by an unrelated item so that grouping cannot
    #    succeed by adjacency alone.
    sample_raw_kb = [
        {
            "source_document": "doc_A",
            "concept": "Mentoring",
            "absolute_page": 1,
            "content": "First part.",
        },
        {
            "source_document": "doc_B",
            "concept": "Tutoring",
            "absolute_page": 10,
            "content": "Tutoring info.",
        },
        {
            "source_document": "doc_A",
            "concept": "Mentoring",
            "absolute_page": 2,
            "content": "Second part.",
        },
    ]

    # 2. Act: run the function under test.
    final_chunks = chunk_by_concept(sample_raw_kb)

    # 3. Assert: check the results.
    assert len(final_chunks) == 2  # Should have grouped into 2 concepts

    # Find the 'Mentoring' chunk for detailed checks. A default of None is
    # passed to next() so that a missing chunk fails the assertion below with
    # a readable message instead of raising a bare StopIteration.
    mentoring_chunk = next(
        (c for c in final_chunks if c["title"] == "Mentoring"),
        None,
    )
    assert mentoring_chunk is not None, "no chunk titled 'Mentoring' was produced"

    assert mentoring_chunk["source_document"] == "doc_A"
    assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
    assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
    assert (
        "Title: Mentoring. Content: First part.\n\nSecond part."
        in mentoring_chunk["content_for_embedding"]
    )
|