Spaces:
Sleeping
Sleeping
File size: 1,730 Bytes
import logging
import re
logger = logging.getLogger(__name__)

# Sentence boundary: the whitespace run that follows '.', '!' or '?'.
SENTENCE_PATTERN = re.compile(r"(?<=[.!?])\s+")

# One whitespace-delimited word. Matched against the source text so that
# every word keeps its real character offsets.
_WORD_PATTERN = re.compile(r"\S+")


def _sentence_words(text: str) -> list[list[tuple[str, int, int]]]:
    """Return, per non-empty sentence, its words as (word, start, end) tuples.

    Offsets index into ``text`` itself.
    """
    # Each entry is (end of this sentence, start of the next one); the final
    # entry closes the last sentence at the end of the text.
    boundaries = [(m.start(), m.end()) for m in SENTENCE_PATTERN.finditer(text)]
    boundaries.append((len(text), len(text)))

    sentences = []
    sentence_start = 0
    for sentence_end, next_start in boundaries:
        words = [
            (m.group(), m.start(), m.end())
            for m in _WORD_PATTERN.finditer(text, sentence_start, sentence_end)
        ]
        if words:  # skip whitespace-only pieces
            sentences.append(words)
        sentence_start = next_start
    return sentences


def _build_chunk(words: list[tuple[str, int, int]], index: int) -> dict:
    """Assemble one chunk record from a non-empty run of (word, start, end)."""
    return {
        "text": " ".join(word for word, _, _ in words),
        "start_char": words[0][1],
        "end_char": words[-1][2],
        "chunk_index": index,
    }


def chunk_text(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
) -> list[dict]:
    """Split ``text`` into overlapping, sentence-aligned chunks of words.

    Sentences are accumulated until adding the next one would push the word
    count past ``chunk_size``; the accumulated words are then emitted as a
    chunk and the last ``chunk_overlap`` words are carried into the next one.
    A sentence is never split, so a chunk can exceed ``chunk_size`` when a
    single sentence (plus the carried overlap) is longer than the limit.

    Args:
        text: Input text. Empty, ``None`` or whitespace-only input yields ``[]``.
        chunk_size: Target maximum number of words per chunk.
        chunk_overlap: Number of trailing words repeated at the start of the
            next chunk. Values ``<= 0`` disable the overlap.

    Returns:
        A list of dicts with keys ``text`` (the chunk's words joined by single
        spaces), ``start_char`` / ``end_char`` (offsets into the original
        ``text`` of the chunk's first word start and last word end) and
        ``chunk_index`` (0-based position in the list). Because whitespace is
        normalised in ``text``, ``text[start_char:end_char]`` matches the
        chunk text up to whitespace differences.
    """
    if not text or not text.strip():
        return []

    chunks: list[dict] = []
    current: list[tuple[str, int, int]] = []

    for words in _sentence_words(text):
        if current and len(current) + len(words) > chunk_size:
            chunks.append(_build_chunk(current, len(chunks)))
            # Carry the tail of the flushed chunk forward. The carried words
            # keep their own source offsets, so the next chunk's start_char
            # stays exact.
            current = current[-chunk_overlap:] if chunk_overlap > 0 else []
        current.extend(words)

    # Whatever is still buffered forms the last chunk.
    if current:
        chunks.append(_build_chunk(current, len(chunks)))

    logger.info(
        "Chunked text into %s chunks (size=%s, overlap=%s)",
        len(chunks),
        chunk_size,
        chunk_overlap,
    )
    return chunks
|