File size: 471 Bytes
4d9fcca
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
def chunk_text(text: str, chunk_size: int = 200, overlap: int = 50) -> list[str]:
    """Split *text* into fixed-size, overlapping character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        The chunks in order of appearance; an empty list for empty *text*.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            greater than or equal to ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    step = chunk_size - overlap
    # A non-positive step would never advance through the text, which made
    # the previous while-loop spin forever while growing the result list.
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # Slicing past the end of the string is safe: the last chunk is just shorter.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]