KnowledgeMesh / app /services /chunking.py
pkheria's picture
pushing to git
b5e0c74
Raw
History Blame Contribute Delete
1.54 kB
import re
import uuid
from app.core.models import Chunk, Document
def chunk_document(document: Document, chunk_size: int, overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping, boundary-aware chunks.

    Runs of three or more newlines in ``document.text`` are collapsed to a
    single blank line and the result is stripped. The text is then walked in
    windows of at most ``chunk_size`` characters. A window that does not reach
    the end of the text is shortened to end just after the last paragraph
    break (blank line) or sentence break (period followed by a space) found
    in it, provided that break lies past the window's midpoint. Each
    following window starts ``overlap`` characters before the previous
    window's end.

    Chunk ids are UUIDv5 values derived from the document source, the chunk
    index and the first 80 characters of the chunk text, so the same input
    yields the same ids.

    Args:
        document: Source document; its ``text``, ``source``, ``source_type``,
            ``title`` and ``metadata`` attributes are read.
        chunk_size: Maximum window length in characters; must be positive.
        overlap: Number of characters shared between consecutive windows;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order. Windows that contain only whitespace
        are skipped and do not consume an index.

    Raises:
        ValueError: If ``chunk_size`` is not positive, if ``overlap`` is
            negative or not smaller than ``chunk_size``, or if the document
            text is empty after normalization.
    """
    if chunk_size <= 0:
        raise ValueError("Chunk size must be a positive integer.")
    if overlap < 0:
        # A negative overlap would start the next window past the previous
        # end and silently drop the text in between.
        raise ValueError("Chunk overlap must not be negative.")
    if overlap >= chunk_size:
        raise ValueError("Chunk overlap must be smaller than chunk size.")

    normalized = re.sub(r"\n{3,}", "\n\n", document.text).strip()
    if not normalized:
        raise ValueError("Document is empty after extraction.")

    total = len(normalized)
    chunks: list[Chunk] = []
    start = 0
    index = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            paragraph_break = normalized.rfind("\n\n", start, end)
            sentence_break = normalized.rfind(". ", start, end)
            best_break = max(paragraph_break, sentence_break)
            candidate_end = best_break + 1
            # Only cut at a natural break when it is past the window midpoint
            # AND the next window would still begin strictly after ``start``.
            # Without the second condition a large overlap could move the
            # next start to (or before) the current one and loop forever.
            if (
                best_break > start + chunk_size // 2
                and candidate_end - overlap > start
            ):
                end = candidate_end

        text = normalized[start:end].strip()
        if text:
            digest = str(
                uuid.uuid5(
                    uuid.NAMESPACE_URL,
                    f"{document.source}:{index}:{text[:80]}",
                )
            )
            chunks.append(
                Chunk(
                    id=digest,
                    text=text,
                    index=index,
                    source_type=document.source_type,
                    source=document.source,
                    title=document.title,
                    metadata=document.metadata,
                )
            )
            index += 1

        if end == total:
            break
        # Forward progress is guaranteed: either ``end`` is a full window
        # (and overlap < chunk_size) or the break was accepted under the
        # progress check above, so ``end - overlap > start >= 0``.
        start = end - overlap
    return chunks