# Provenance: uploaded to the Hugging Face Hub via huggingface_hub
# (commit 409c17a, verified) by user Baktabek.
"""
Application Layer - Chunking Service
Handles intelligent document chunking.
"""
import re
from typing import List, Optional
from uuid import UUID

from app.domain.entities import DocumentChunk
class ChunkingService:
    """Service for splitting document text into overlapping, paragraph-aware chunks.

    Text is split on blank lines into paragraphs, which are packed greedily
    into chunks of at most ~``chunk_size`` approximate tokens. Each new chunk
    is seeded with the tail of the previous one so adjacent chunks share
    context. Token counts are approximated as ``len(text) // 4``.
    """

    def __init__(
        self,
        chunk_size: int = 800,
        chunk_overlap: int = 100,
        min_chunk_size: int = 100,
    ):
        """Configure chunking limits.

        Args:
            chunk_size: Soft upper bound on approximate tokens per chunk.
            chunk_overlap: Approximate tokens carried over between chunks.
            min_chunk_size: Chunks below this token count are discarded.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size

    async def chunk_text(
        self, text: str, document_id: UUID, metadata: Optional[dict] = None
    ) -> List[DocumentChunk]:
        """Chunk *text* at paragraph boundaries into DocumentChunk entities.

        Args:
            text: Raw document text.
            document_id: Id of the owning document, stamped on every chunk.
            metadata: Optional metadata copied onto each chunk (each chunk
                gets its own dict, so per-chunk mutation is isolated).

        Returns:
            Chunks of at least ``min_chunk_size`` tokens, with contiguous
            ``chunk_index`` values starting at 0.
        """
        if metadata is None:
            metadata = {}

        paragraphs = self._split_paragraphs(text)

        # Greedily pack paragraphs, flushing when the next paragraph would
        # push the current chunk past chunk_size.
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0
        for para in paragraphs:  # fixed: loop index was unused
            para_tokens = self._count_tokens(para)
            if current_size + para_tokens > self.chunk_size and current_chunk:
                # Flush current chunk.
                chunks.append("\n\n".join(current_chunk))
                # Seed the next chunk with the tail of the one just flushed.
                overlap_text = self._get_overlap(current_chunk)
                current_chunk = [overlap_text, para] if overlap_text else [para]
                current_size = self._count_tokens("\n\n".join(current_chunk))
            else:
                # NOTE: a single paragraph larger than chunk_size becomes an
                # oversized chunk on its own; it is never split further.
                current_chunk.append(para)
                current_size += para_tokens
        # Flush the remainder.
        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        # Filter undersized chunks BEFORE numbering so chunk_index values are
        # contiguous (previously enumerate ran first, leaving index gaps when
        # a small chunk was dropped).
        kept = [c for c in chunks if self._count_tokens(c) >= self.min_chunk_size]
        return [
            DocumentChunk(
                document_id=document_id,
                chunk_index=idx,
                content=chunk,
                token_count=self._count_tokens(chunk),
                # dict() copy: previously every chunk shared the SAME metadata
                # dict, so mutating one chunk's metadata mutated them all.
                metadata=dict(metadata),
            )
            for idx, chunk in enumerate(kept)
        ]

    def _split_paragraphs(self, text: str) -> List[str]:
        """Split *text* on blank lines into non-empty, stripped paragraphs."""
        paragraphs = re.split(r"\n\s*\n", text)
        return [p.strip() for p in paragraphs if p.strip()]

    def _count_tokens(self, text: str) -> int:
        """Approximate token count (1 token ≈ 4 characters)."""
        return len(text) // 4

    def _get_overlap(self, chunks: List[str]) -> str:
        """Return the trailing words of the last paragraph group as overlap.

        ``chunk_overlap`` is in approximate tokens; the 0.25 factor is a
        rough token→word conversion. Returns "" when *chunks* is empty.
        """
        if not chunks:
            return ""
        last_chunk = chunks[-1]
        tokens = last_chunk.split()
        overlap_tokens = int(self.chunk_overlap * 0.25)  # rough word budget
        if len(tokens) <= overlap_tokens:
            return last_chunk
        return " ".join(tokens[-overlap_tokens:])