financial_qa_rag / utils /chunking.py
jayyd's picture
Rename utils/chunking to utils/chunking.py
7483111 verified
# utils/chunking.py
import re
import nltk
# Fetch the sentence-tokenizer data used by sent_tokenize. NLTK 3.9 and later
# load the "punkt_tab" resource, while older releases load the legacy "punkt"
# pickle, so request both; quiet=True suppresses the download log output.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize
def smart_chunk_text(text, chunk_size: int = 300, overlap: int = 50, min_words: int = 30) -> list:
    """Split text into overlapping, word-bounded chunks, paragraph by paragraph.

    The text is split into paragraphs on blank lines. Within each paragraph,
    sentences are accumulated into a word buffer; whenever the buffer holds
    at least ``chunk_size`` words, its first ``chunk_size`` words are emitted
    as a chunk and the last ``overlap`` of them are kept as the start of the
    next chunk. A sentence longer than ``chunk_size`` words is windowed
    together with whatever is still buffered, after which the buffer starts
    empty again. Chunks never span two paragraphs.

    Args:
        text: A string, or a list/tuple of strings (joined with newlines).
            ``None`` is treated as empty input.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.
        min_words: A chunk shorter than ``chunk_size`` is kept only if it
            has more than this many words.

    Returns:
        A list of chunk strings in document order, each made of
        whitespace-separated words joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Reject sizes that would make the window step zero or negative; those
    # previously surfaced as an opaque range() error or silently lost text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if text is None:
        return []
    # Ensure input is a single string
    if isinstance(text, (list, tuple)):
        text = "\n".join(text)

    step = chunk_size - overlap
    chunks = []

    # Split text into paragraphs (separated by blank lines)
    for para in re.split(r"\n\s*\n", text):
        if not para.strip():
            # Nothing to tokenize in an empty paragraph.
            continue

        buffer = []
        # Number of leading buffer words that already appeared in the
        # previously emitted chunk (i.e. the carried-over overlap).
        carried = 0

        for sent in sent_tokenize(para):
            sent_words = sent.split()

            if len(sent_words) > chunk_size:
                # Window the pending words together with the long sentence so
                # chunks stay contiguous and in document order, then start
                # the buffer afresh after the sentence.
                chunks.extend(
                    _window_words(buffer + sent_words, chunk_size, step, min_words)
                )
                buffer = []
                carried = 0
                continue

            buffer.extend(sent_words)
            # Drain in a loop: after one trim the buffer may still hold a
            # full chunk, and it must never grow beyond chunk_size.
            while len(buffer) >= chunk_size:
                chunks.append(" ".join(buffer[:chunk_size]))
                buffer = buffer[step:]
                carried = overlap

        # Leftover words (end of paragraph): keep them only if they add
        # something beyond the carried overlap and are long enough.
        if len(buffer) > carried and len(buffer) > min_words:
            chunks.append(" ".join(buffer))

    return chunks


def _window_words(words, chunk_size, step, min_words):
    """Return overlapping chunk strings covering a standalone list of words."""
    pieces = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        # Full-size windows are always kept; a shorter tail must exceed
        # min_words to be worth emitting.
        if len(window) == chunk_size or len(window) > min_words:
            pieces.append(" ".join(window))
        if start + chunk_size >= len(words):
            # This window reached the end; any further window would only
            # repeat words that were already emitted.
            break
    return pieces