Spaces:

DunasAB
/

llm-chat-project

Sleeping

DunasAnastasiia

Initial commit (Xet)

7c2e31a 4 months ago

729 Bytes

	from __future__ import annotations

	from dataclasses import dataclass


	@dataclass(frozen=True)
	class Chunk:
	    """Immutable record holding one piece of text and its identifiers.

	    Instances are frozen: fields cannot be reassigned after construction,
	    and instances are hashable and compare by field values.
	    """

	    # Integer identifier of this chunk (numbering scheme is up to the
	    # caller; presumably the chunk's position within its source -- confirm).
	    chunk_id: int
	    # Identifier of the source the text came from (presumably a document
	    # name or path -- confirm against callers).
	    source_id: str
	    # The chunk's text content.
	    text: str


	def chunk_text(text: str, chunk_chars: int, overlap_chars: int) -> list[str]:
	    """Split *text* into character-based chunks that overlap.

	    Works on any text without requiring a tokenizer. The input is stripped
	    of surrounding whitespace first, and every emitted chunk is stripped
	    too; windows that contain only whitespace are dropped.

	    Args:
	        text: Text to split. ``None`` or an empty/whitespace-only string
	            yields an empty list.
	        chunk_chars: Maximum number of characters per chunk. A value
	            ``<= 0`` disables chunking and returns the whole stripped text
	            as a single chunk.
	        overlap_chars: Number of characters shared between consecutive
	            windows. Negative values are treated as ``0``; values
	            ``>= chunk_chars`` are clamped so the window still advances by
	            one character per step.

	    Returns:
	        The chunks in order of appearance. The last chunk is the first
	        window that reaches the end of the text, so no chunk is merely a
	        suffix of the one before it.
	    """
	    text = (text or "").strip()
	    if not text:
	        return []
	    if chunk_chars <= 0:
	        return [text]

	    n = len(text)
	    # Always advance by at least one character so the loop terminates even
	    # when the requested overlap is as large as the chunk itself.
	    step = max(1, chunk_chars - max(0, overlap_chars))

	    out: list[str] = []
	    for start in range(0, n, step):
	        end = start + chunk_chars
	        chunk = text[start:end].strip()
	        if chunk:
	            out.append(chunk)
	        # Once a window covers the end of the text, any further window
	        # would only repeat a tail already contained in this chunk.
	        if end >= n:
	            break
	    return out