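# Page residue from the hosting site, kept as a comment so the module parses:
# author: m97j · commit message: "First codes update" · commit: 69c12a2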
# app/modules/common/utils.py
import re
def count_tokens(text):
    """Return the number of whitespace-separated tokens in ``text``.

    This is a plain word count based on ``str.split()`` (any run of
    whitespace is one separator); it does not use a model tokenizer.

    Args:
        text: The string to measure. ``None`` or an empty string counts
            as zero tokens instead of raising.

    Returns:
        The number of whitespace-delimited pieces in ``text``.
    """
    if not text:
        return 0
    return len(text.split())
# Fenced code blocks (```...```) or a run of text spanning four pipes, which
# the splitter treats as a markdown table fragment. DOTALL lets both span lines.
_MARKDOWN_BLOCK_RE = re.compile(r'(```.*?```|\|.*?\|.*?\|.*?\|)', flags=re.S)

# Sentence separator: whitespace following ., ! or ?, or any run of newlines.
_SENTENCE_SEP_RE = re.compile(r'(?<=[.!?])\s+|\n+')


def _split_sentences(segment, offset):
    """Split ``segment`` into sentences, returning (snippet, end_offset) pairs.

    ``offset`` is the index of ``segment`` inside the full text, so each
    ``end_offset`` is an absolute position in the full text. Blank pieces
    are dropped; snippets are stripped, while the offset marks the end of
    the unstripped piece (i.e. where the separator begins).
    """
    pairs = []
    start = 0
    for sep in _SENTENCE_SEP_RE.finditer(segment):
        pairs.append((segment[start:sep.start()], offset + sep.start()))
        start = sep.end()
    pairs.append((segment[start:], offset + len(segment)))
    return [(piece.strip(), end) for piece, end in pairs if piece.strip()]


def split_content(text: str, role: str = None, return_boundaries: bool = False):
    """
    Split text into snippets or boundaries.

    - user: sentence-level split
    - assistant: markdown-aware split (fenced code blocks and pipe-delimited
      table fragments are kept as single units; the rest is split by sentence)
    - plain text: sentence-level split

    Args:
        text: The content to split. ``None`` or an empty string yields ``[]``.
        role: ``"assistant"`` enables the markdown-aware split; any other
            value (including ``None``) uses the sentence-level split.
        return_boundaries: When true, return offsets instead of snippets.

    Returns:
        A list of stripped snippet strings, or, when ``return_boundaries`` is
        true, a list of character offsets into ``text`` marking where each
        snippet ends. Offsets are non-decreasing and never exceed
        ``len(text)``; both lists have one entry per snippet.

    usage modules: conversation/history_manager.py, data/page_crawler.py,
    models/llm_model.py -> refine -> _chunk_tokens_with_offsets_safe()
    """
    if not text:
        return []

    if role == "assistant":
        # Partition the text into alternating (plain, block, plain, ...) spans
        # while remembering where each span sits in the original string.
        spans = []
        cursor = 0
        for block in _MARKDOWN_BLOCK_RE.finditer(text):
            spans.append((cursor, block.start()))
            spans.append((block.start(), block.end()))
            cursor = block.end()
        spans.append((cursor, len(text)))

        pairs = []
        for start, end in spans:
            segment = text[start:end]
            if not segment.strip():
                continue
            # Same classification as before: anything that opens with a fence
            # or a pipe is kept whole (this also covers an unterminated fence).
            if segment.startswith(("```", "|")):
                # The boundary is the end of this block, not of the whole text.
                pairs.append((segment.strip(), end))
            else:
                pairs.extend(_split_sentences(segment, start))
    else:
        # plain text or user role
        pairs = _split_sentences(text, 0)

    if return_boundaries:
        return [end for _, end in pairs]
    return [snippet for snippet, _ in pairs]