github-actions
Sync from GitHub @ b179dfb
3694da1
"""Split long text into overlapping chunks for embedding.
Approximation: 1500 chars ≈ 375 tokens for English/Latin scripts. That fits
the SmolLM2 2048-token context window comfortably when retrieving 3 chunks
plus a few chat turns. For CJK or other dense scripts each character costs
more tokens (fewer chars per token), but the embedder doesn't care.
"""
from __future__ import annotations
import re
def chunk_text(
    text: str,
    chunk_chars: int = 1500,
    overlap_chars: int = 200,
) -> list[str]:
    """Split *text* into paragraph-aligned chunks with trailing overlap.

    Line endings are normalised to LF and runs of three or more newlines are
    collapsed to a single blank line before splitting. Paragraphs (separated
    by blank lines) are then walked in order:

    • If the paragraph fits in the current buffer: append it.
    • Else, flush the buffer as a chunk and seed the next buffer with the
      last ``overlap_chars`` characters of the flushed chunk.
    • If the paragraph fits in a chunk on its own but not behind the full
      overlap tail: shorten the tail so the paragraph stays in one piece.
    • If the paragraph itself is longer than ``chunk_chars``: hard-split it.

    Args:
        text: Text to split. ``None`` or an empty string yields ``[]``.
        chunk_chars: Maximum number of characters per chunk; must be positive.
        overlap_chars: Number of characters repeated from the end of a flushed
            chunk at the start of the next one. Negative values are treated
            as zero.

    Returns:
        The non-empty chunks, each stripped of surrounding whitespace, in
        document order.

    Raises:
        ValueError: If ``chunk_chars`` is not positive.
    """
    if chunk_chars <= 0:
        # A non-positive window can only produce empty slices, which would
        # silently discard the whole document.
        raise ValueError("chunk_chars must be a positive integer")
    # A negative overlap has no meaning here; treat it as "no overlap" so it
    # is never forwarded to the hard splitter as a widened stride.
    overlap_chars = max(0, overlap_chars)

    text = re.sub(r"\r\n?", "\n", text or "")
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    if not text:
        return []
    if len(text) <= chunk_chars:
        return [text]

    chunks: list[str] = []
    buffer = ""
    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        if _can_fit(buffer, para, chunk_chars):
            buffer = _append(buffer, para)
            continue

        # Doesn't fit: flush the buffer (if any) and keep only its tail as
        # the overlap that opens the next chunk.
        if buffer:
            chunks.append(buffer)
            buffer = buffer[-overlap_chars:] if overlap_chars > 0 else ""

        if not _can_fit("", para, chunk_chars):
            # The paragraph alone exceeds chunk_chars: hard-split it. The
            # pending overlap tail is dropped because the windows produced
            # here carry their own overlap.
            chunks.extend(_hard_split(para, chunk_chars, overlap_chars))
            buffer = ""
            continue

        if not _can_fit(buffer, para, chunk_chars):
            # The paragraph fits in a chunk but not behind the full overlap
            # tail. Shrink the tail to the space left after the paragraph and
            # its two-character separator instead of cutting the paragraph.
            room = chunk_chars - len(para) - 2
            buffer = buffer[-room:] if room > 0 else ""
        buffer = _append(buffer, para)

    if buffer:
        chunks.append(buffer)

    # Overlap tails may begin or end on whitespace; strip once and drop blanks.
    stripped = (chunk.strip() for chunk in chunks)
    return [chunk for chunk in stripped if chunk]
def _can_fit(buffer: str, para: str, chunk_chars: int) -> bool:
sep = 2 if buffer else 0
return len(buffer) + sep + len(para) <= chunk_chars
def _append(buffer: str, para: str) -> str:
return f"{buffer}\n\n{para}" if buffer else para
def _hard_split(text: str, chunk_chars: int, overlap_chars: int) -> list[str]:
step = max(1, chunk_chars - overlap_chars)
return [text[i : i + chunk_chars] for i in range(0, len(text), step)]