import os
import re
from typing import List

# One or more line-break characters separate independent paragraphs.
_PARAGRAPH_BREAK = re.compile(r"[\r\n]+")
# Whitespace that directly follows sentence-ending punctuation (. ! ? …).
_SENTENCE_BREAK = re.compile(r"(?<=[.!?…])\s+")
# Whitespace that directly follows minor punctuation (, ; : - – —).
_CLAUSE_BREAK = re.compile(r"(?<=[,;:\-–—])\s+")


def _pack_words(text: str, max_chars: int) -> List[str]:
    """Greedily pack the whitespace-separated words of *text* into pieces.

    Words are re-joined with single spaces. A single word longer than
    ``max_chars`` is cut into ``max_chars``-sized slices so that no piece
    ever exceeds the limit.

    The last element of the returned list is the still-open remainder that
    the caller may keep extending; all preceding elements are complete.
    The list is empty only if *text* contains no words.
    """
    pieces: List[str] = []
    current = ""
    for word in text.split():
        if len(word) > max_chars:
            # No whitespace to break on: fall back to a hard character split.
            if current:
                pieces.append(current)
            slices = [
                word[i:i + max_chars] for i in range(0, len(word), max_chars)
            ]
            *complete, current = slices
            pieces.extend(complete)
        elif current and len(current) + 1 + len(word) > max_chars:
            pieces.append(current)
            current = word
        else:
            current = f"{current} {word}" if current else word
    if current:
        pieces.append(current)
    return pieces


def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """Split raw text into chunks no longer than ``max_chars`` characters.

    The text is broken down progressively, using the coarsest boundary that
    still satisfies the limit:

    1. Line breaks: each line/paragraph is handled independently, and a
       chunk never spans two paragraphs.
    2. Sentence-ending punctuation (``. ! ? …``) followed by whitespace.
       Consecutive sentences are merged (joined by one space) while they fit.
    3. For a sentence that is itself too long: minor punctuation
       (``, ; : - – —``) followed by whitespace.
    4. For a clause that is still too long: individual words.
    5. For a single word that is still too long: fixed-size character slices.

    Args:
        text: The raw input text.
        max_chars: Maximum length of every returned chunk; must be >= 1.

    Returns:
        The non-empty, stripped chunks in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks: List[str] = []
    for paragraph in _PARAGRAPH_BREAK.split(text.strip()):
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        buffer = ""
        for sentence in _SENTENCE_BREAK.split(paragraph):
            sentence = sentence.strip()
            if not sentence:
                continue

            if len(sentence) <= max_chars:
                # Normal sentence: append to the buffer if it still fits,
                # otherwise flush the buffer and start a new one.
                if buffer and len(buffer) + 1 + len(sentence) > max_chars:
                    chunks.append(buffer)
                    buffer = sentence
                else:
                    buffer = f"{buffer} {sentence}" if buffer else sentence
                continue

            # Oversized sentence: flush what we have, then rebuild it from
            # clauses so the earlier sentences stay in their own chunk.
            if buffer:
                chunks.append(buffer)
                buffer = ""
            for part in _CLAUSE_BREAK.split(sentence):
                part = part.strip()
                if not part:
                    continue
                if len(buffer) + 1 + len(part) <= max_chars:
                    buffer = f"{buffer} {part}" if buffer else part
                    continue
                if buffer:
                    chunks.append(buffer)
                if len(part) <= max_chars:
                    buffer = part
                else:
                    # Even the clause is too long: pack it word by word and
                    # keep the trailing remainder open for the next clause.
                    *complete, buffer = _pack_words(part, max_chars)
                    chunks.extend(complete)

        # End of paragraph: flush whatever is left in the buffer.
        if buffer:
            chunks.append(buffer)

    return chunks


def env_bool(name: str, default: bool = False) -> bool:
    """Read environment variable *name* as a boolean flag.

    Args:
        name: Name of the environment variable.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` if the variable is unset. Otherwise ``True`` when the
        value, stripped and lower-cased, is one of ``1``, ``true``, ``yes``,
        ``y`` or ``on``; any other value (including an empty string) is
        ``False``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("1", "true", "yes", "y", "on")