# llm-chat-project/rag/chunking.py
# DunasAnastasiia - Initial commit (Xet) - 7c2e31a
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Chunk:
    """Immutable record describing one piece of chunked text.

    Attributes:
        chunk_id: Integer identifier of the chunk.
        source_id: String identifier of the source the chunk belongs to.
        text: The chunk's text content.
    """

    chunk_id: int
    source_id: str
    text: str
def chunk_text(text: str, chunk_chars: int, overlap_chars: int) -> list[str]:
    """Split *text* into overlapping, character-based chunks.

    Works on any text without requiring a tokenizer. The input is stripped
    of surrounding whitespace first, and every chunk is stripped as well;
    chunks that are empty after stripping are dropped.

    Args:
        text: Text to split. ``None`` or whitespace-only input yields ``[]``.
        chunk_chars: Maximum number of characters per chunk. A value of
            zero or less disables splitting and returns the whole stripped
            text as a single chunk.
        overlap_chars: Number of characters shared between consecutive
            windows. Negative values are treated as ``0``. If the overlap
            is greater than or equal to ``chunk_chars``, the window still
            advances by one character per step so the loop always ends.

    Returns:
        The chunks in document order. The final chunk is the first window
        that reaches the end of the text, so no trailing chunk is emitted
        that would merely repeat the tail of the previous one.
    """
    text = (text or "").strip()
    if not text:
        return []
    if chunk_chars <= 0:
        return [text]

    # Distance between consecutive window starts; never less than 1 so the
    # window always moves forward.
    step = max(1, chunk_chars - max(0, overlap_chars))
    n = len(text)

    out: list[str] = []
    for start in range(0, n, step):
        end = start + chunk_chars
        chunk = text[start:end].strip()
        if chunk:
            out.append(chunk)
        # Once a window covers the end of the text, any later window would
        # be a strict suffix of this one and add only duplicate content.
        if end >= n:
            break
    return out