from typing import List def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]: """ Split text into overlapping chunks based on word count. Args: text: Input text to chunk chunk_size: Number of words per chunk overlap: Number of overlapping words between chunks Returns: List of text chunks """ words = text.split() chunks = [] if len(words) <= chunk_size: return [text] start = 0 while start < len(words): end = start + chunk_size chunk_words = words[start:end] chunks.append(" ".join(chunk_words)) if end >= len(words): break start = end - overlap return chunks def chunk_documents(documents: List[dict], chunk_size: int = 500, overlap: int = 100) -> List[dict]: """ Chunk multiple documents while preserving metadata. Returns: List of dicts with 'text' and 'metadata' keys """ chunked_docs = [] for doc in documents: text = doc["text"] metadata = doc.get("metadata", {}) chunks = chunk_text(text, chunk_size, overlap) for i, chunk in enumerate(chunks): chunked_docs.append({ "text": chunk, "metadata": { **metadata, "chunk_id": i, "total_chunks": len(chunks) } }) return chunked_docs