# app/modules/common/utils.py import re def count_tokens(text): return len(text.split()) def split_content(text: str, role: str = None, return_boundaries: bool = False): """ Split text into snippets or boundaries. - user: sentence-level split - assistant: markdown-aware split - plain text: sentence-level split usage modules: conversation/history_manager.py, data/page_crawler.py, models/llm_model.py -> refine -> _chunk_tokens_with_offsets_safe() """ snippets = [] boundaries = [] if role == "assistant": # markdown-aware split blocks = re.split(r'(```.*?```|\|.*?\|.*?\|.*?\|)', text, flags=re.S) for b in blocks: if not b.strip(): continue if b.startswith("```") or b.startswith("|"): snippets.append(b.strip()) boundaries.append(len(text)) # treat block as one unit else: sentences = re.split(r'(?<=[.!?])\s+|\n+', b) for s in sentences: if s.strip(): snippets.append(s.strip()) boundaries.append(text.find(s) + len(s)) else: # plain text or user role sentences = re.split(r'(?<=[.!?])\s+|\n+', text) for s in sentences: if s.strip(): snippets.append(s.strip()) boundaries.append(text.find(s) + len(s)) return boundaries if return_boundaries else snippets