from typing import List

import tiktoken

from src.config import TOKEN_ENCODING, CHUNK_TOKENS, CHUNK_OVERLAP

# Built once at import time and reused by every chunk_text() call.
_enc = tiktoken.get_encoding(TOKEN_ENCODING)


def chunk_text(
    text: str,
    chunk_tokens: int = CHUNK_TOKENS,
    overlap_tokens: int = CHUNK_OVERLAP,
) -> List[str]:
    """Split *text* into overlapping chunks whose size is measured in tokens.

    The text is encoded with the module-level encoder, sliced into windows
    of at most ``chunk_tokens`` tokens, and each window is decoded back to a
    string. Consecutive windows share ``overlap_tokens`` tokens. Each decoded
    chunk is stripped of surrounding whitespace, and chunks that are empty
    after stripping are omitted.

    Args:
        text: The text to split.
        chunk_tokens: Maximum number of tokens per chunk. Must be positive.
        overlap_tokens: Number of tokens shared between consecutive chunks.
            Must satisfy ``0 <= overlap_tokens < chunk_tokens``.

    Returns:
        The chunks in document order. Empty when ``text`` encodes to no
        tokens or every chunk is whitespace-only.

    Raises:
        ValueError: If ``chunk_tokens`` is not positive, or if
            ``overlap_tokens`` is negative or not smaller than
            ``chunk_tokens``.
    """
    # Validate up front: with overlap >= chunk size the window start would
    # never advance and the loop below could not terminate on text longer
    # than one chunk; a negative overlap would silently skip tokens.
    if chunk_tokens <= 0:
        raise ValueError(
            f"chunk_tokens must be positive, got {chunk_tokens}"
        )
    if not 0 <= overlap_tokens < chunk_tokens:
        raise ValueError(
            "overlap_tokens must satisfy 0 <= overlap_tokens < chunk_tokens, "
            f"got overlap_tokens={overlap_tokens}, chunk_tokens={chunk_tokens}"
        )

    tokens = _enc.encode(text)
    total = len(tokens)
    # Each window starts this many tokens after the previous one; the
    # validation above guarantees it is at least 1.
    stride = chunk_tokens - overlap_tokens

    chunks: List[str] = []
    for start in range(0, total, stride):
        end = min(start + chunk_tokens, total)
        # NOTE(review): a window boundary may fall inside a multi-token
        # character, so the decoded edge of a chunk may not be clean text --
        # confirm against the tiktoken decode documentation.
        chunk = _enc.decode(tokens[start:end]).strip()
        if chunk:
            chunks.append(chunk)
        # Once a window reaches the final token, any further window would
        # only repeat the overlapping tail, so stop here.
        if end == total:
            break
    return chunks