File size: 561 Bytes
f06dea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# text_splitter.py

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Shared, module-level splitter instance reused by every split_text() call.
# Separators are listed from coarsest to finest: paragraph break, line break,
# period, single space, and finally the empty string.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters
# (the library's default length function) -- confirm against the library docs.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=500,
    chunk_overlap=150,
)

def split_text(text: str, *, min_length: int = 50) -> list:
    """Split *text* into cleaned chunks using the module-level ``splitter``.

    Every chunk returned by ``splitter.split_text`` is stripped of leading and
    trailing whitespace, and only chunks strictly longer than ``min_length``
    characters (after stripping) are kept.

    Args:
        text: The string to split.
        min_length: Length threshold for keeping a chunk; a stripped chunk is
            kept only when ``len(chunk) > min_length``. Defaults to 50, which
            matches the previous hard-coded cutoff. Pass 0 to keep every
            non-empty chunk.

    Returns:
        A list of stripped chunk strings, in the order the splitter produced
        them. The list is empty when no chunk exceeds ``min_length`` -- in
        particular, an input shorter than the threshold yields no chunks.
    """
    # Strip first so the length filter measures real content, not padding.
    stripped_chunks = (chunk.strip() for chunk in splitter.split_text(text))

    # Drop fragments at or below the threshold (e.g. stray separators or
    # whitespace-only pieces).
    return [chunk for chunk in stripped_chunks if len(chunk) > min_length]