Spaces:
Running on Zero
Running on Zero
File size: 1,540 Bytes
b5e0c74 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | import re
import uuid
from app.core.models import Chunk, Document
def chunk_document(document: Document, chunk_size: int, overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping chunks.

    Runs of three or more newlines are collapsed to a blank line and the
    text is stripped. The text is then cut into windows of at most
    ``chunk_size`` characters. When a window does not reach the end of the
    text, the cut is moved back to the last paragraph break (``"\\n\\n"``)
    or sentence break (``". "``) inside the window, provided that break
    lies far enough into the window (see below). Each following window
    starts ``overlap`` characters before the previous cut.

    Args:
        document: Source document; its ``text``, ``source``,
            ``source_type``, ``title`` and ``metadata`` attributes are read.
        chunk_size: Maximum number of characters per window.
        overlap: Number of characters shared between consecutive windows;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The non-empty chunks in document order, with consecutive ``index``
        values starting at 0.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``chunk_size``, or if the normalized text is empty.
    """
    if overlap >= chunk_size:
        raise ValueError("Chunk overlap must be smaller than chunk size.")
    if overlap < 0:
        # A negative overlap would silently skip text between chunks.
        # Together with the check above this also implies chunk_size > 0.
        raise ValueError("Chunk overlap must not be negative.")

    normalized = re.sub(r"\n{3,}", "\n\n", document.text).strip()
    if not normalized:
        raise ValueError("Document is empty after extraction.")

    total = len(normalized)
    # A natural break is only honoured when it lies past the middle of the
    # window (to avoid tiny chunks) AND past the overlap width. The second
    # condition guarantees that ``end - overlap`` is strictly greater than
    # ``start``, so the loop always makes forward progress; without it a
    # large overlap could move the cursor backwards and loop forever.
    min_break_offset = max(chunk_size // 2, overlap)

    chunks: list[Chunk] = []
    start = 0
    index = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            paragraph_break = normalized.rfind("\n\n", start, end)
            sentence_break = normalized.rfind(". ", start, end)
            best_break = max(paragraph_break, sentence_break)
            if best_break > start + min_break_offset:
                # Keep the period (or the first newline) with this chunk.
                end = best_break + 1

        text = normalized[start:end].strip()
        if text:
            # Deterministic id: same source, position and leading text
            # always yield the same UUID.
            digest = str(
                uuid.uuid5(uuid.NAMESPACE_URL, f"{document.source}:{index}:{text[:80]}")
            )
            chunks.append(
                Chunk(
                    id=digest,
                    text=text,
                    index=index,
                    source_type=document.source_type,
                    source=document.source,
                    title=document.title,
                    metadata=document.metadata,
                )
            )
            index += 1

        if end == total:
            break
        # end - overlap > start is guaranteed by the checks above.
        start = end - overlap
    return chunks
|