File size: 2,933 Bytes
84f4fa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# file: chunking.py
import uuid
from typing import List, Tuple, Dict, Any
from langchain_core.documents import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Configuration for Parent-Child Splitting ---
# These values are passed as `chunk_size` / `chunk_overlap` to the
# RecursiveCharacterTextSplitter instances built in create_parent_child_chunks.
# No custom length function is supplied there, so sizes are measured with the
# splitter's default (character count in LangChain -- confirm against the
# installed version).

# Parent chunks are the larger documents passed to the LLM for context.
PARENT_CHUNK_SIZE = 2000
PARENT_CHUNK_OVERLAP = 200

# Child chunks are the smaller, more granular documents used for retrieval.
# They are produced by re-splitting each parent chunk, so the child size
# should stay below PARENT_CHUNK_SIZE.
CHILD_CHUNK_SIZE = 400
CHILD_CHUNK_OVERLAP = 100

def create_parent_child_chunks(
    full_text: str
) -> Tuple[List[Document], InMemoryStore, Dict[str, str]]:
    """
    Implements the Parent Document strategy for chunking.

    1. Splits the document into larger "parent" chunks
       (PARENT_CHUNK_SIZE / PARENT_CHUNK_OVERLAP).
    2. Splits each parent chunk into smaller "child" chunks
       (CHILD_CHUNK_SIZE / CHILD_CHUNK_OVERLAP).
    3. The child chunks are used for retrieval, while the parent chunks
       are used to provide context to the LLM.

    Every parent is stored in the returned docstore under a freshly generated
    UUID4 string. Every child receives its own UUID4 string and carries two
    metadata entries: ``"parent_id"`` (the key of its parent in the docstore)
    and ``"child_id"`` (its own ID).

    Args:
        full_text: The entire text content of the document. Empty, ``None``
            or whitespace-only input is treated as "nothing to chunk".

    Returns:
        A tuple containing:
        - A list of the small "child" documents for the vector store, in
          parent order.
        - An in-memory store mapping parent document IDs to the parent documents.
        - A dictionary mapping child document IDs to their parent's ID.
        For empty input all three are empty.
    """
    # Whitespace-only text produces no chunks, so short-circuit it together
    # with truly empty input instead of building splitters and reporting
    # "Created 0 parent chunks".
    if not full_text or not full_text.strip():
        print("Warning: Input text for chunking is empty.")
        return [], InMemoryStore(), {}

    print("Creating parent and child chunks...")

    # This splitter creates the large documents that will be stored.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=PARENT_CHUNK_SIZE,
        chunk_overlap=PARENT_CHUNK_OVERLAP,
    )

    # This splitter creates the small, granular chunks for retrieval.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHILD_CHUNK_SIZE,
        chunk_overlap=CHILD_CHUNK_OVERLAP,
    )

    parent_documents = parent_splitter.create_documents([full_text])

    # Generate one unique ID per parent document and register the parents
    # in the store in a single batched call.
    parent_ids = [str(uuid.uuid4()) for _ in parent_documents]
    docstore = InMemoryStore()
    docstore.mset(list(zip(parent_ids, parent_documents)))

    child_documents: List[Document] = []
    child_to_parent_id_map: Dict[str, str] = {}

    # Walk IDs and parents in lockstep; each parent is re-split on its own so
    # that no child chunk ever spans two parents.
    for parent_id, parent_doc in zip(parent_ids, parent_documents):
        for child_doc in child_splitter.split_documents([parent_doc]):
            child_id = str(uuid.uuid4())
            # Tag the child so a retrieval hit can be traced back to its
            # parent both via metadata and via the returned mapping.
            child_doc.metadata["parent_id"] = parent_id
            child_doc.metadata["child_id"] = child_id
            child_to_parent_id_map[child_id] = parent_id
            child_documents.append(child_doc)

    print(f"Created {len(parent_documents)} parent chunks and {len(child_documents)} child chunks.")
    return child_documents, docstore, child_to_parent_id_map