|
|
"""Text splitter for chunking documents.""" |
|
|
|
|
|
from dataclasses import dataclass |
|
|
from typing import List, Optional |
|
|
|
|
|
from src.config import ChunkingConfig |
|
|
from src.document_loader.loader import Document |
|
|
|
|
|
|
|
|
@dataclass
class TextChunk:
    """One piece of text together with its bookkeeping data.

    Attributes:
        content: The text held by this chunk.
        metadata: Free-form dictionary of data attached to the chunk.
        chunk_index: Integer index identifying the chunk.
    """

    content: str
    metadata: dict
    chunk_index: int
|
|
|
|
|
|
|
|
class TextSplitter:
    """Split text into overlapping chunks.

    Each chunk covers at most ``config.chunk_size`` characters of the input
    and, where possible, ends just after a paragraph, line, or sentence
    separator. The next chunk starts ``config.chunk_overlap`` characters
    before the previous one ended.
    """

    # Preferred break points, highest priority first: paragraph break,
    # line break, then sentence-ending punctuation followed by a space.
    _SEPARATORS = ("\n\n", "\n", ". ", "! ", "? ")

    def __init__(self, config: Optional[ChunkingConfig] = None):
        """Initialize the text splitter.

        Args:
            config: Chunking configuration. Uses defaults if not provided.
        """
        self.config = config or ChunkingConfig()

    @classmethod
    def _chunk_spans(cls, text: str, chunk_size: int, chunk_overlap: int) -> List[tuple]:
        """Compute the ``(start, end)`` character spans of successive chunks.

        Args:
            text: Text to partition.
            chunk_size: Maximum number of characters per span.
            chunk_overlap: Number of characters the next span re-reads from
                the end of the previous one.

        Returns:
            List of ``(start, end)`` offsets into ``text`` (end exclusive),
            in order. The last span always ends at ``len(text)``.

        Raises:
            ValueError: If ``chunk_size`` is not positive (the scan could
                never advance).
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")

        spans = []
        text_length = len(text)
        start = 0
        while start < text_length:
            end = start + chunk_size
            if end >= text_length:
                # The remainder fits in a single chunk. Stop here: stepping
                # back by the overlap would only re-emit a suffix of it.
                spans.append((start, text_length))
                break

            # End just after the last occurrence of the highest-priority
            # separator inside the window. A separator sitting exactly at
            # `start` is ignored so the span is never reduced to the
            # separator alone.
            for separator in cls._SEPARATORS:
                position = text.rfind(separator, start, end)
                if position > start:
                    end = position + len(separator)
                    break
            spans.append((start, end))

            # Step back by the overlap, but always move forward: if the
            # overlap would land at or before the current start, continue
            # from the end of this span instead.
            next_start = end - chunk_overlap
            start = next_start if next_start > start else end
        return spans

    def split_text(self, text: str, metadata: Optional[dict] = None) -> List[TextChunk]:
        """Split text into chunks.

        Windows-style line endings are normalized to ``"\\n"`` before
        splitting, and each chunk's content is stripped of surrounding
        whitespace. Spans that contain only whitespace produce no chunk.

        Args:
            text: Text to split.
            metadata: Optional metadata to attach to chunks. It is copied
                into every chunk's metadata alongside ``chunk_index``,
                ``start_char`` and ``end_char`` (offsets into the
                normalized text, measured before stripping).

        Returns:
            List of text chunks; empty if ``text`` is blank.

        Raises:
            ValueError: If the configured ``chunk_size`` is not positive.
        """
        if not text.strip():
            return []

        base_metadata = metadata or {}
        normalized = text.replace("\r\n", "\n")
        spans = self._chunk_spans(
            normalized, self.config.chunk_size, self.config.chunk_overlap
        )

        chunks = []
        for start, end in spans:
            content = normalized[start:end].strip()
            if not content:
                continue
            # Indices count only the chunks actually emitted.
            index = len(chunks)
            chunks.append(TextChunk(
                content=content,
                metadata={
                    **base_metadata,
                    "chunk_index": index,
                    "start_char": start,
                    "end_char": end,
                },
                chunk_index=index,
            ))
        return chunks

    def split_documents(self, documents: List[Document]) -> List[TextChunk]:
        """Split multiple documents into chunks.

        Each document is split independently using its ``content`` and
        ``metadata``, so chunk indices restart at 0 for every document.

        Args:
            documents: List of documents to split.

        Returns:
            List of text chunks from all documents, in document order.
        """
        all_chunks = []
        for doc in documents:
            all_chunks.extend(self.split_text(doc.content, doc.metadata))
        return all_chunks
|
|
|