"""
Application Layer - Chunking Service
Handles intelligent document chunking.
"""

import re
from typing import List, Optional
from uuid import UUID

from app.domain.entities import DocumentChunk


class ChunkingService:
    """Split document text into overlapping, paragraph-aligned chunks.

    All sizes are expressed in approximate tokens as measured by
    ``_count_tokens`` (character count integer-divided by four).
    """

    def __init__(
        self,
        chunk_size: int = 800,
        chunk_overlap: int = 100,
        min_chunk_size: int = 100,
    ):
        """Store the chunking parameters.

        Args:
            chunk_size: Approximate token budget; a chunk is flushed before
                adding a paragraph that would push it past this value.
            chunk_overlap: Figure from which the amount of text carried
                from one chunk into the next is derived (see
                ``_get_overlap``).
            min_chunk_size: Chunks with fewer approximate tokens than this
                are dropped from the result.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size

    async def chunk_text(
        self, text: str, document_id: UUID, metadata: Optional[dict] = None
    ) -> List[DocumentChunk]:
        """Chunk text along paragraph boundaries.

        The text is split into paragraphs, which are packed greedily into
        chunks of roughly ``chunk_size`` approximate tokens. Each new chunk
        starts with overlap text taken from the last paragraph of the chunk
        before it. A single paragraph larger than ``chunk_size`` is kept
        whole rather than split.

        Args:
            text: Raw document text.
            document_id: Identifier stored on every produced chunk.
            metadata: Optional mapping attached to every chunk; each chunk
                receives its own shallow copy.

        Returns:
            One ``DocumentChunk`` per chunk whose approximate token count is
            at least ``min_chunk_size``. ``chunk_index`` is the position in
            the unfiltered chunk list, so indices may have gaps when small
            chunks are dropped.
        """
        if metadata is None:
            metadata = {}

        # 1. Split by paragraphs
        paragraphs = self._split_paragraphs(text)

        # 2. Combine into chunks
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0

        for para in paragraphs:
            para_tokens = self._count_tokens(para)

            if current_size + para_tokens > self.chunk_size and current_chunk:
                # Flush current chunk
                chunks.append("\n\n".join(current_chunk))

                # Start new chunk with overlap
                overlap_text = self._get_overlap(current_chunk)
                current_chunk = [overlap_text, para] if overlap_text else [para]
                current_size = self._count_tokens("\n\n".join(current_chunk))
            else:
                current_chunk.append(para)
                current_size += para_tokens

        # Flush remaining
        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        # 3. Create DocumentChunk entities
        # NOTE(review): text shorter than min_chunk_size yields no chunks at
        # all - confirm that dropping short documents is intended.
        entities: List[DocumentChunk] = []
        for idx, chunk in enumerate(chunks):
            token_count = self._count_tokens(chunk)
            if token_count < self.min_chunk_size:
                continue
            entities.append(
                DocumentChunk(
                    document_id=document_id,
                    chunk_index=idx,
                    content=chunk,
                    token_count=token_count,
                    # Copy so that mutating one chunk's metadata cannot
                    # leak into its siblings or the caller's dict.
                    metadata=dict(metadata),
                )
            )
        return entities

    def _split_paragraphs(self, text: str) -> List[str]:
        """Split text into paragraphs.

        Paragraphs are separated by blank lines (a newline, optional
        whitespace, and another newline). Each paragraph is stripped and
        empty ones are discarded.
        """
        paragraphs = re.split(r"\n\s*\n", text)
        return [p.strip() for p in paragraphs if p.strip()]

    def _count_tokens(self, text: str) -> int:
        """Approximate token count (1 token ≈ 4 chars)"""
        return len(text) // 4

    def _get_overlap(self, chunks: List[str]) -> str:
        """Get the overlap text to carry into the next chunk.

        Args:
            chunks: Paragraphs of the chunk that was just flushed.

        Returns:
            The trailing words of the last paragraph, or the whole
            paragraph unchanged when it is short enough. Returns an empty
            string when there are no paragraphs or when the overlap budget
            rounds down to zero words.
        """
        if not chunks:
            return ""

        # The budget is counted in whitespace-separated words.
        # NOTE(review): 0.25 words per token looks low next to the
        # 4-chars-per-token estimate in _count_tokens, so for typical prose
        # the overlap is likely well under chunk_overlap tokens - confirm
        # the intended ratio.
        overlap_words = int(self.chunk_overlap * 0.25)

        # A zero budget must mean "no overlap": words[-0:] would otherwise
        # select the entire paragraph.
        if overlap_words <= 0:
            return ""

        last_paragraph = chunks[-1]
        words = last_paragraph.split()

        if len(words) <= overlap_words:
            return last_paragraph

        return " ".join(words[-overlap_words:])