Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
@dataclass
class Chunk:
    """A piece of text bundled with two identifiers.

    Declared as a dataclass so instances can be built with
    ``Chunk(chunk_id=..., source_id=..., text=...)`` and get generated
    ``__init__``, ``__repr__`` and ``__eq__`` methods.
    """

    # Integer identifier of the chunk (presumably its index within the
    # source -- confirm against the code that builds Chunk objects).
    chunk_id: int
    # String identifier of where the text came from (presumably a document
    # or file id -- confirm against the caller).
    source_id: str
    # The chunk's text content.
    text: str
def chunk_text(text: str, chunk_chars: int, overlap_chars: int) -> list[str]:
    """Split *text* into character-based chunks with optional overlap.

    Works on any text and needs no tokenizer: windows of ``chunk_chars``
    characters are taken every ``chunk_chars - overlap_chars`` characters.

    Args:
        text: Text to split. ``None`` or whitespace-only input yields ``[]``.
            Surrounding whitespace is stripped before chunking.
        chunk_chars: Maximum number of characters per chunk. If ``<= 0`` the
            whole stripped text is returned as a single chunk.
        overlap_chars: Number of characters shared by consecutive windows.
            Negative values are treated as ``0``; values ``>= chunk_chars``
            make the window advance one character at a time.

    Returns:
        The chunks in order of appearance. Each chunk is stripped of
        surrounding whitespace, and chunks that become empty are dropped.
    """
    text = (text or "").strip()
    if not text:
        return []
    if chunk_chars <= 0:
        return [text]

    # Always advance by at least one character so the loop terminates even
    # when the overlap is as large as (or larger than) the chunk size.
    step = max(1, chunk_chars - max(0, overlap_chars))
    n = len(text)

    out: list[str] = []
    for start in range(0, n, step):
        chunk = text[start : start + chunk_chars].strip()
        if chunk:
            out.append(chunk)
        # Once a window reaches the end of the text, any later window would
        # be a pure suffix of this one (redundant duplicate content), so stop.
        if start + chunk_chars >= n:
            break
    return out