File size: 1,569 Bytes
b0ef2dc
 
 
 
 
 
 
 
 
280d562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0ef2dc
 
 
 
 
 
 
 
 
 
280d562
b0ef2dc
 
 
 
280d562
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def test_chunk_by_concept_groups_correctly():
    """Check that chunk_by_concept merges items sharing a document and concept.

    Three raw items are supplied: two for ("doc_A", "Mentoring"), separated
    by one for ("doc_B", "Tutoring"). The test expects two chunks back, and
    expects the "Mentoring" chunk to list both pages and to contain the two
    content pieces joined by a blank line, in input order.
    """
    # Imported inside the test so that an import failure is reported
    # against this test rather than breaking collection of the module.
    from src.fot_recommender.semantic_chunker import chunk_by_concept

    # 1. Arrange: the two "Mentoring" items are deliberately non-adjacent,
    #    so the grouping cannot rely on neighbouring input rows.
    sample_raw_kb = [
        {
            "source_document": "doc_A",
            "concept": "Mentoring",
            "absolute_page": 1,
            "content": "First part.",
        },
        {
            "source_document": "doc_B",
            "concept": "Tutoring",
            "absolute_page": 10,
            "content": "Tutoring info.",
        },
        {
            "source_document": "doc_A",
            "concept": "Mentoring",
            "absolute_page": 2,
            "content": "Second part.",
        },
    ]

    # 2. Act: run the function under test.
    final_chunks = chunk_by_concept(sample_raw_kb)

    # 3. Assert: three items collapse into two (document, concept) groups.
    assert len(final_chunks) == 2  # Should have grouped into 2 concepts

    # Look up the 'Mentoring' chunk for detailed checks. The explicit None
    # default matters: without it next() raises StopIteration when nothing
    # matches, and the "is not None" assertion below could never fail.
    mentoring_chunk = next(
        (c for c in final_chunks if c["title"] == "Mentoring"),
        None,
    )

    assert mentoring_chunk is not None, "no chunk titled 'Mentoring' was produced"
    assert mentoring_chunk["source_document"] == "doc_A"
    assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
    assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
    assert (
        "Title: Mentoring. Content: First part.\n\nSecond part."
        in mentoring_chunk["content_for_embedding"]
    )