Spaces:

baveshraam
/

open-notebook

Running

App Files Files Community

open-notebook / open_notebook /utils /text_utils.py

baveshraam

FIX: SurrealDB 2.0 migration syntax and Frontend/CORS link

f871fed 2 days ago

raw

history blame contribute delete

5.17 kB

	"""
	Text utilities for Open Notebook.
	Extracted from main utils to avoid circular imports.
	"""

	import re
	import unicodedata
	from typing import Tuple

	from langchain_text_splitters import RecursiveCharacterTextSplitter

	from .token_utils import token_count

	# Regexes that locate model "thinking" sections inside AI responses.
	# Well-formed case: an explicit <think>...</think> pair; the capture is
	# non-greedy and may span multiple lines.
	THINK_PATTERN = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)
	# Malformed case: text at the very start of the string that is closed by
	# </think> although no opening <think> tag was emitted.
	THINK_PATTERN_NO_OPEN = re.compile(r"^(.*?)</think>", flags=re.DOTALL)


	def split_text(txt: str, chunk_size=500):
	    """
	    Break text into chunks whose size is measured with ``token_count``.

	    Consecutive chunks are configured to overlap by 15% of ``chunk_size``
	    (truncated to an integer).

	    Args:
	        txt (str): The text to divide.
	        chunk_size (int): Target chunk size passed to the splitter.
	            Defaults to 500.

	    Returns:
	        list: The chunks returned by the splitter's ``split_text``.
	    """
	    # Separator candidates handed to the splitter, listed from coarsest
	    # (paragraph break) to finest (empty string).
	    boundary_candidates = [
	        "\n\n",
	        "\n",
	        ".",
	        ",",
	        " ",
	        "\u200b",  # Zero-width space
	        "\uff0c",  # Fullwidth comma
	        "\u3001",  # Ideographic comma
	        "\uff0e",  # Fullwidth full stop
	        "\u3002",  # Ideographic full stop
	        "",
	    ]
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=int(chunk_size * 0.15),
	        length_function=token_count,
	        separators=boundary_candidates,
	    )
	    return splitter.split_text(txt)


	def remove_non_ascii(text: str) -> str:
	    """Return ``text`` with every character outside the 7-bit ASCII range dropped."""
	    non_ascii_run = re.compile(r"[^\x00-\x7F]+")
	    return non_ascii_run.sub("", text)


	def remove_non_printable(text: str) -> str:
	    """
	    Normalize whitespace and strip characters outside a small allow-list.

	    The steps run in this order:

	    1. Special Unicode spaces (U+2000-U+200B, U+202F, U+205F, U+3000)
	       become a regular space.
	    2. Each line terminator (a CRLF pair, a lone CR, U+2028 or U+2029)
	       becomes a single line feed.
	    3. Characters whose Unicode category starts with "C" are dropped,
	       except line feeds and tabs.
	    4. Non-breaking spaces become regular spaces and the text is stripped
	       of leading/trailing whitespace.
	    5. Everything that is not a word character, whitespace, or one of
	       ``. , ! ? -`` is removed.

	    Args:
	        text (str): The text to clean.

	    Returns:
	        str: The cleaned text.
	    """
	    # Replace any special Unicode whitespace characters with a regular space
	    text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)

	    # Replace unusual line terminators with a single newline. CRLF is matched
	    # as one unit so a Windows line ending does not turn into two newlines.
	    text = re.sub(r"\r\n?|[\u2028\u2029]", "\n", text)

	    # Remove control characters, except newlines and tabs
	    text = "".join(
	        char
	        for char in text
	        if char in "\n\t" or unicodedata.category(char)[0] != "C"
	    )

	    # Replace non-breaking spaces with regular spaces
	    text = text.replace("\xa0", " ").strip()

	    # Keep letters (including accented ones), numbers, whitespace and basic
	    # punctuation; str patterns are Unicode-aware by default in Python 3.
	    return re.sub(r"[^\w\s.,!?\-\n\t]", "", text)


	def parse_thinking_content(content: str) -> Tuple[str, str]:
	    """
	    Parse message content to extract thinking content from <think> tags.

	    Handles both well-formed tags and malformed output where the opening
	    <think> tag is missing but </think> is present.

	    Args:
	        content (str): The original message content. Non-string input is
	            tolerated: ``None`` yields an empty cleaned string, anything
	            else is converted with ``str()``.

	    Returns:
	        Tuple[str, str]: (thinking_content, cleaned_content)
	            - thinking_content: Content from within <think> tags; multiple
	              blocks are joined with a blank line. Empty when none found.
	            - cleaned_content: Original content with <think> blocks removed.
	              Content longer than 100,000 characters is returned unparsed.

	    Example:
	        >>> content = "<think>Let me analyze this</think>Here's my answer"
	        >>> thinking, cleaned = parse_thinking_content(content)
	        >>> print(thinking)
	        Let me analyze this
	        >>> print(cleaned)
	        Here's my answer
	    """
	    # Input validation
	    if not isinstance(content, str):
	        return "", str(content) if content is not None else ""

	    # Skip parsing for very large content (more than 100,000 characters)
	    if len(content) > 100000:
	        return "", content

	    # Find all well-formed thinking blocks
	    thinking_matches = THINK_PATTERN.findall(content)

	    if thinking_matches:
	        # Join all thinking content with double newlines
	        thinking_content = "\n\n".join(match.strip() for match in thinking_matches)

	        # Remove all <think>...</think> blocks from the original content
	        cleaned_content = THINK_PATTERN.sub("", content)

	        # Collapse any run of three or more line breaks (optionally separated
	        # by horizontal whitespace) into a single blank line. The character
	        # class excludes "\n" so the two quantified parts cannot overlap.
	        cleaned_content = re.sub(
	            r"\n(?:[^\S\n]*\n){2,}", "\n\n", cleaned_content
	        ).strip()

	        return thinking_content, cleaned_content

	    # Handle malformed output: content</think> (missing opening tag)
	    # Some models like Nemotron output thinking without the opening <think> tag
	    malformed_match = THINK_PATTERN_NO_OPEN.match(content)
	    if malformed_match:
	        thinking_content = malformed_match.group(1).strip()
	        # Everything after the </think> tag is the visible answer
	        cleaned_content = content[malformed_match.end():].strip()
	        return thinking_content, cleaned_content

	    return "", content


	def clean_thinking_content(content: str) -> str:
	    """
	    Strip thinking sections from an AI response and return what remains.

	    Convenience wrapper around ``parse_thinking_content`` for callers that
	    only want the cleaned text and have no use for the extracted thinking.

	    Args:
	        content (str): Message content that may contain <think> tags.

	    Returns:
	        str: The second element of ``parse_thinking_content``'s result,
	        i.e. the content with thinking removed.

	    Example:
	        >>> content = "<think>Let me think...</think>Here's the answer"
	        >>> clean_thinking_content(content)
	        "Here's the answer"
	    """
	    return parse_thinking_content(content)[1]