Spaces:
Running
Running
File size: 561 Bytes
# text_splitter.py
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Module-level splitter shared by every call to split_text().
# The separator list runs from coarse to fine: blank line, newline,
# sentence period, space, and finally the empty string as a last resort.
# Sizes are in the splitter's length units (presumably characters, the
# library default -- confirm against the langchain_text_splitters docs).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_overlap=150,
    chunk_size=500,
)
def split_text(text, *, min_chunk_length=50):
    """Split *text* into chunks and discard the very short ones.

    The text is passed to the module-level ``splitter``. Every chunk it
    returns is stripped of leading/trailing whitespace and kept only if
    the stripped chunk is strictly longer than ``min_chunk_length``.

    Args:
        text: The string to split. An empty string or ``None`` yields an
            empty list without invoking the splitter.
        min_chunk_length: Keyword-only. Chunks whose stripped length is
            less than or equal to this value are dropped. Defaults to 50,
            the threshold that used to be hard-coded.

    Returns:
        A list of stripped chunk strings in the order the splitter
        produced them. The list may be empty; in particular, an input
        no longer than ``min_chunk_length`` characters produces no chunks.
    """
    if not text:
        return []

    # Strip first, then measure: the length filter must see the cleaned
    # chunk, not the raw one padded with whitespace.
    stripped_chunks = (chunk.strip() for chunk in splitter.split_text(text))

    # Tiny fragments are treated as noise and removed.
    return [chunk for chunk in stripped_chunks if len(chunk) > min_chunk_length]