Spaces:

m97j
/

pragmatic-agent

Sleeping

First codes update

69c12a2 5 months ago

1.5 kB

	# app/modules/common/utils.py
	import re


	def count_tokens(text):
	return len(text.split())

	def split_content(text: str, role: str = None, return_boundaries: bool = False):
	"""
	Split text into snippets or boundaries.
	- user: sentence-level split
	- assistant: markdown-aware split
	- plain text: sentence-level split
	usage modules: conversation/history_manager.py, data/page_crawler.py, models/llm_model.py -> refine -> _chunk_tokens_with_offsets_safe()
	"""
	snippets = []
	boundaries = []

	if role == "assistant":
	# markdown-aware split
	blocks = re.split(r'(```.?```\|\\|.?\\|.?\\|.?\\|)', text, flags=re.S)
	for b in blocks:
	if not b.strip():
	continue
	if b.startswith("```") or b.startswith("\|"):
	snippets.append(b.strip())
	boundaries.append(len(text)) # treat block as one unit
	else:
	sentences = re.split(r'(?<=[.!?])\s+\|\n+', b)
	for s in sentences:
	if s.strip():
	snippets.append(s.strip())
	boundaries.append(text.find(s) + len(s))
	else:
	# plain text or user role
	sentences = re.split(r'(?<=[.!?])\s+\|\n+', text)
	for s in sentences:
	if s.strip():
	snippets.append(s.strip())
	boundaries.append(text.find(s) + len(s))

	return boundaries if return_boundaries else snippets