# policy_rag_assistant/src/chunking.py
# Uploaded by Harshdhsvguyt ("Upload 19 files", commit 754d8d3, verified)
from typing import List
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks based on word count.

    Words are obtained by splitting on whitespace. Consecutive chunks share
    ``overlap`` words, so each new chunk starts ``chunk_size - overlap``
    words after the previous one.

    Args:
        text: Input text to chunk
        chunk_size: Number of words per chunk (must be positive)
        overlap: Number of overlapping words between chunks
            (must satisfy 0 <= overlap < chunk_size)

    Returns:
        List of text chunks. Text containing at most ``chunk_size`` words is
        returned unchanged as a single chunk; otherwise every chunk is its
        words re-joined with single spaces.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Validate up front: with overlap >= chunk_size the window would never
    # advance (an endless loop), and a negative overlap would skip words.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()

    # Short input: hand back the original string untouched (whitespace kept).
    if len(words) <= chunk_size:
        return [text]

    stride = chunk_size - overlap  # guaranteed >= 1 by the checks above
    chunks = []
    for start in range(0, len(words), stride):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Stop once a window reaches the final word; continuing would only
        # emit a trailing chunk fully contained in the overlap.
        if end >= len(words):
            break
    return chunks
def chunk_documents(documents: List[dict], chunk_size: int = 500, overlap: int = 100) -> List[dict]:
    """
    Chunk multiple documents while preserving metadata.

    Each document's "text" is split with chunk_text, and every resulting
    chunk carries a copy of the document's "metadata" (an empty dict when
    the key is absent) extended with its position and the chunk count.

    Args:
        documents: Dicts with a required "text" key and optional "metadata"
        chunk_size: Number of words per chunk, forwarded to chunk_text
        overlap: Number of overlapping words, forwarded to chunk_text

    Returns:
        List of dicts with 'text' and 'metadata' keys, in document order
        and then chunk order. 'metadata' additionally holds 'chunk_id'
        (0-based index within the document) and 'total_chunks'.
    """
    results = []
    for document in documents:
        body = document["text"]
        base_meta = document.get("metadata", {})
        pieces = chunk_text(body, chunk_size, overlap)
        total = len(pieces)
        results.extend(
            {
                "text": piece,
                # Later keys win, so chunk bookkeeping overrides any
                # same-named entries in the document's own metadata.
                "metadata": {**base_meta, "chunk_id": index, "total_chunks": total},
            }
            for index, piece in enumerate(pieces)
        )
    return results