Spaces:
Running
Running
| """ | |
| Application Layer - Chunking Service | |
| Handles intelligent document chunking. | |
| """ | |
import re
from typing import List, Optional
from uuid import UUID

from app.domain.entities import DocumentChunk
class ChunkingService:
    """Service for chunking documents into semantically coherent pieces.

    Text is split on paragraph boundaries, then paragraphs are packed into
    chunks of roughly ``chunk_size`` tokens. A tail of the previous chunk
    (about ``chunk_overlap`` tokens) is carried into the next chunk so that
    context is preserved across chunk boundaries.
    """

    def __init__(
        self,
        chunk_size: int = 800,
        chunk_overlap: int = 100,
        min_chunk_size: int = 100,
    ):
        # Target size of a chunk, in approximate tokens (see _count_tokens).
        self.chunk_size = chunk_size
        # Approximate tokens of trailing context carried into the next chunk.
        self.chunk_overlap = chunk_overlap
        # Chunks below this token count are discarded (stray headers, noise).
        self.min_chunk_size = min_chunk_size

    async def chunk_text(
        self, text: str, document_id: UUID, metadata: Optional[dict] = None
    ) -> List["DocumentChunk"]:
        """Chunk *text* on paragraph boundaries into DocumentChunk entities.

        Args:
            text: Raw document text.
            document_id: Id of the owning document, stamped onto every chunk.
            metadata: Optional metadata; a copy is attached to each chunk.

        Returns:
            Chunks of at least ``min_chunk_size`` tokens, in document order,
            with consecutive ``chunk_index`` values starting at 0.
        """
        if metadata is None:
            metadata = {}

        # 1. Split into paragraphs (blank-line separated).
        paragraphs = self._split_paragraphs(text)

        # 2. Pack paragraphs into chunks, flushing when the budget is exceeded.
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0
        for para in paragraphs:
            para_tokens = self._count_tokens(para)
            if current_size + para_tokens > self.chunk_size and current_chunk:
                # Flush the full chunk, then seed the next one with overlap
                # text so context is not lost at the boundary.
                chunks.append("\n\n".join(current_chunk))
                overlap_text = self._get_overlap(current_chunk)
                current_chunk = [overlap_text, para] if overlap_text else [para]
                current_size = self._count_tokens("\n\n".join(current_chunk))
            else:
                current_chunk.append(para)
                current_size += para_tokens

        # Flush whatever remains after the last paragraph.
        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        # 3. Drop too-small chunks FIRST, then index, so chunk_index values
        # are consecutive (the old enumerate-then-filter left gaps).
        kept = [c for c in chunks if self._count_tokens(c) >= self.min_chunk_size]

        # Each entity gets its own metadata copy so mutating one chunk's
        # metadata cannot silently affect its siblings.
        return [
            DocumentChunk(
                document_id=document_id,
                chunk_index=idx,
                content=chunk,
                token_count=self._count_tokens(chunk),
                metadata=dict(metadata),
            )
            for idx, chunk in enumerate(kept)
        ]

    def _split_paragraphs(self, text: str) -> List[str]:
        """Split *text* into non-empty, stripped paragraphs.

        Paragraph boundaries are blank lines (a newline, optional whitespace,
        then another newline).
        """
        paragraphs = re.split(r"\n\s*\n", text)
        return [p.strip() for p in paragraphs if p.strip()]

    def _count_tokens(self, text: str) -> int:
        """Approximate token count (heuristic: 1 token ~= 4 characters)."""
        return len(text) // 4

    def _get_overlap(self, chunks: List[str]) -> str:
        """Return the trailing words of the last chunk as overlap text.

        The 0.25 factor is a rough tokens->words conversion so the overlap
        is on the order of ``chunk_overlap`` tokens; returns "" when there
        is no previous chunk.
        """
        if not chunks:
            return ""
        last_chunk = chunks[-1]
        words = last_chunk.split()
        overlap_words = int(self.chunk_overlap * 0.25)
        if len(words) <= overlap_words:
            # The whole previous chunk fits inside the overlap budget.
            return last_chunk
        return " ".join(words[-overlap_words:])