Hoanglinhn01
/

Ttd

Model card Files Files and versions

Ttd / utils /core_utils.py

Hoanglinhn01's picture

Upload 5 files

6921612 verified 28 days ago

history blame contribute delete

3.06 kB

	import re
	import os
	from typing import List

	def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
	    """Split raw text into chunks no longer than ``max_chars`` characters.

	    The text is broken down progressively, always preferring the most
	    natural boundary that still satisfies the length limit:

	    1. Line breaks: every line/paragraph is processed independently, so a
	       chunk never spans two paragraphs.
	    2. Sentence ends (``.``, ``!``, ``?``, ``…`` followed by whitespace):
	       consecutive sentences are packed together while they fit.
	    3. Minor punctuation (``,``, ``;``, ``:``, ``-``, ``–``, ``—`` followed
	       by whitespace): used only for a sentence longer than ``max_chars``.
	    4. Whitespace between words: used for a clause that is still too long.
	    5. Fixed-width slices: used for a single whitespace-free token that is
	       longer than ``max_chars`` on its own (long URL, unspaced script, ...).

	    Args:
	        text: Raw input text.
	        max_chars: Maximum length of each returned chunk; must be >= 1.

	    Returns:
	        The non-empty, stripped chunks in reading order. Empty or
	        whitespace-only input yields an empty list.

	    Raises:
	        ValueError: If ``max_chars`` is smaller than 1.
	    """
	    if max_chars < 1:
	        # No non-empty chunk can satisfy a limit below one character.
	        raise ValueError("max_chars must be >= 1")

	    final_chunks: List[str] = []

	    # 1. Split by newlines first - each line/paragraph is handled on its own.
	    for para in re.split(r"[\r\n]+", text.strip()):
	        para = para.strip()
	        if not para:
	            continue

	        buffer = ""
	        # 2. Split the current paragraph into sentences.
	        for sentence in re.split(r"(?<=[\.\!\?\…])\s+", para):
	            sentence = sentence.strip()
	            if not sentence:
	                continue

	            if len(sentence) <= max_chars:
	                # Normal sentence: append to the buffer if it still fits,
	                # otherwise emit the buffer and start a new one.
	                if buffer and len(buffer) + 1 + len(sentence) > max_chars:
	                    final_chunks.append(buffer)
	                    buffer = sentence
	                else:
	                    buffer = f"{buffer} {sentence}" if buffer else sentence
	                continue

	            # Oversized sentence: flush what we have so the sentence starts
	            # on a fresh chunk, then break it up at minor punctuation.
	            if buffer:
	                final_chunks.append(buffer)
	                buffer = ""

	            for part in re.split(r"(?<=[\,\;\:\-\–\—])\s+", sentence):
	                part = part.strip()
	                if not part:
	                    continue

	                # The joining space only costs a character when there is
	                # already something in the buffer.
	                needed = len(buffer) + 1 + len(part) if buffer else len(part)
	                if needed <= max_chars:
	                    buffer = f"{buffer} {part}" if buffer else part
	                    continue

	                if buffer:
	                    final_chunks.append(buffer)

	                if len(part) <= max_chars:
	                    buffer = part
	                    continue

	                # Even this clause is too long: wrap it word by word. All
	                # complete lines are final; the last one stays in the
	                # buffer so following clauses can still be appended to it.
	                lines = _wrap_words(part, max_chars)
	                final_chunks.extend(lines[:-1])
	                buffer = lines[-1] if lines else ""

	        # End of paragraph: flush whatever is left in the buffer.
	        if buffer:
	            final_chunks.append(buffer)

	    return [c.strip() for c in final_chunks if c.strip()]


	def _wrap_words(text: str, max_chars: int) -> List[str]:
	    """Greedily wrap ``text`` at whitespace into lines of at most ``max_chars``.

	    A single word longer than ``max_chars`` is cut into consecutive
	    ``max_chars``-sized slices so that no returned line exceeds the limit.
	    """
	    lines: List[str] = []
	    current = ""
	    for word in text.split():
	        if len(word) > max_chars:
	            # The word cannot fit on any line: emit the pending line and
	            # slice the word; the final slice may be shorter, so keep it
	            # open for the words that follow.
	            if current:
	                lines.append(current)
	            pieces = [
	                word[i:i + max_chars] for i in range(0, len(word), max_chars)
	            ]
	            lines.extend(pieces[:-1])
	            current = pieces[-1]
	        elif current and len(current) + 1 + len(word) > max_chars:
	            lines.append(current)
	            current = word
	        else:
	            current = f"{current} {word}" if current else word
	    if current:
	        lines.append(current)
	    return lines

	def env_bool(name: str, default: bool = False) -> bool:
	    """Interpret the environment variable ``name`` as a boolean flag.

	    Args:
	        name: Name of the environment variable to read.
	        default: Value returned as-is when the variable is not set.

	    Returns:
	        ``default`` if the variable is unset; otherwise ``True`` when its
	        value, ignoring surrounding whitespace and letter case, is one of
	        ``1``, ``true``, ``yes``, ``y`` or ``on``, and ``False`` for any
	        other value (including the empty string).
	    """
	    truthy = {"1", "true", "yes", "y", "on"}
	    raw = os.getenv(name)
	    return default if raw is None else (raw.strip().lower() in truthy)