Spaces:
Running on Zero
Running on Zero
| import re | |
| import uuid | |
| from app.core.models import Chunk, Document | |
def chunk_document(document: Document, chunk_size: int, overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping, boundary-aware chunks.

    The text is normalized first: runs of three or more newlines collapse to
    a single blank line and surrounding whitespace is stripped. The text is
    then walked in windows of at most ``chunk_size`` characters. A window that
    does not reach the end of the text is shortened to the last paragraph
    break (blank line) or sentence break (period followed by a space) found
    inside it, provided that break lies past the window's midpoint. Each
    following window starts ``overlap`` characters before the previous end.

    Args:
        document: Source document; its ``text``, ``source``, ``source_type``,
            ``title`` and ``metadata`` attributes are read.
        chunk_size: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between consecutive windows.

    Returns:
        The chunks in document order. ``index`` counts only non-empty chunks,
        and ``id`` is a UUIDv5 derived from the source, the index and the
        first 80 characters of the chunk text.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``, if
            ``chunk_size`` is not positive, if ``overlap`` is negative, or if
            the document has no text left after normalization.
    """
    if overlap >= chunk_size:
        raise ValueError("Chunk overlap must be smaller than chunk size.")
    if chunk_size <= 0:
        raise ValueError("Chunk size must be a positive integer.")
    if overlap < 0:
        raise ValueError("Chunk overlap must not be negative.")

    normalized = re.sub(r"\n{3,}", "\n\n", document.text).strip()
    if not normalized:
        raise ValueError("Document is empty after extraction.")

    total = len(normalized)
    chunks: list[Chunk] = []
    start = 0
    index = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # rfind returns -1 when the separator is absent, so max() picks
            # whichever boundary occurs latest inside the window (or -1).
            paragraph_break = normalized.rfind("\n\n", start, end)
            sentence_break = normalized.rfind(". ", start, end)
            best_break = max(paragraph_break, sentence_break)
            # Only cut at a boundary past the window's midpoint, and only if
            # the shortened window still ends beyond ``start + overlap``.
            # Otherwise the next start (end - overlap) would not move forward
            # and the loop would never terminate.
            if best_break > start + max(chunk_size // 2, overlap - 1):
                # +1 keeps the period (or first newline) in this chunk.
                end = best_break + 1

        text = normalized[start:end].strip()
        if text:
            digest = str(
                uuid.uuid5(uuid.NAMESPACE_URL, f"{document.source}:{index}:{text[:80]}")
            )
            chunks.append(
                Chunk(
                    id=digest,
                    text=text,
                    index=index,
                    source_type=document.source_type,
                    source=document.source,
                    title=document.title,
                    # NOTE(review): every chunk receives the same metadata
                    # object, not a copy.
                    metadata=document.metadata,
                )
            )
            index += 1

        if end == total:
            break
        # end > start + overlap is guaranteed above, so this always advances
        # and never goes negative.
        start = end - overlap
    return chunks