baveshraam's picture
FIX: SurrealDB 2.0 migration syntax and Frontend/CORS link
f871fed
"""
Text utilities for Open Notebook.
Extracted from main utils to avoid circular imports.
"""
import re
import unicodedata
from typing import Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
from .token_utils import token_count
# Patterns for matching thinking content in AI responses
# Standard pattern: <think>...</think>
THINK_PATTERN = re.compile(r"<think>(.*?)</think>", re.DOTALL)
# Pattern for malformed output: content</think> (missing opening tag)
THINK_PATTERN_NO_OPEN = re.compile(r"^(.*?)</think>", re.DOTALL)
def split_text(txt: str, chunk_size=500):
"""
Split the input text into chunks.
Args:
txt (str): The input text to be split.
chunk_size (int): The size of each chunk. Default is 500.
Returns:
list: A list of text chunks.
"""
overlap = int(chunk_size * 0.15)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=overlap,
length_function=token_count,
separators=[
"\n\n",
"\n",
".",
",",
" ",
"\u200b", # Zero-width space
"\uff0c", # Fullwidth comma
"\u3001", # Ideographic comma
"\uff0e", # Fullwidth full stop
"\u3002", # Ideographic full stop
"",
],
)
return text_splitter.split_text(txt)
def remove_non_ascii(text: str) -> str:
"""Remove non-ASCII characters from text."""
return re.sub(r"[^\x00-\x7F]+", "", text)
def remove_non_printable(text: str) -> str:
"""Remove non-printable characters from text."""
# Replace any special Unicode whitespace characters with a regular space
text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)
# Replace unusual line terminators with a single newline
text = re.sub(r"[\u2028\u2029\r]", "\n", text)
# Remove control characters, except newlines and tabs
text = "".join(
char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t"
)
# Replace non-breaking spaces with regular spaces
text = text.replace("\xa0", " ").strip()
# Keep letters (including accented ones), numbers, spaces, newlines, tabs, and basic punctuation
return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)
def parse_thinking_content(content: str) -> Tuple[str, str]:
"""
Parse message content to extract thinking content from <think> tags.
Handles both well-formed tags and malformed output where the opening
<think> tag is missing but </think> is present.
Args:
content (str): The original message content
Returns:
Tuple[str, str]: (thinking_content, cleaned_content)
- thinking_content: Content from within <think> tags
- cleaned_content: Original content with <think> blocks removed
Example:
>>> content = "<think>Let me analyze this</think>Here's my answer"
>>> thinking, cleaned = parse_thinking_content(content)
>>> print(thinking)
"Let me analyze this"
>>> print(cleaned)
"Here's my answer"
"""
# Input validation
if not isinstance(content, str):
return "", str(content) if content is not None else ""
# Limit processing for very large content (100KB limit)
if len(content) > 100000:
return "", content
# Find all well-formed thinking blocks
thinking_matches = THINK_PATTERN.findall(content)
if thinking_matches:
# Join all thinking content with double newlines
thinking_content = "\n\n".join(match.strip() for match in thinking_matches)
# Remove all <think>...</think> blocks from the original content
cleaned_content = THINK_PATTERN.sub("", content)
# Clean up extra whitespace
cleaned_content = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned_content).strip()
return thinking_content, cleaned_content
# Handle malformed output: content</think> (missing opening tag)
# Some models like Nemotron output thinking without the opening <think> tag
malformed_match = THINK_PATTERN_NO_OPEN.match(content)
if malformed_match:
thinking_content = malformed_match.group(1).strip()
# Remove the thinking content and </think> tag
cleaned_content = content[malformed_match.end():].strip()
return thinking_content, cleaned_content
return "", content
def clean_thinking_content(content: str) -> str:
"""
Remove thinking content from AI responses, returning only the cleaned content.
This is a convenience function for cases where you only need the cleaned
content and don't need access to the thinking process.
Args:
content (str): The original message content with potential <think> tags
Returns:
str: Content with <think> blocks removed and whitespace cleaned
Example:
>>> content = "<think>Let me think...</think>Here's the answer"
>>> clean_thinking_content(content)
"Here's the answer"
"""
_, cleaned_content = parse_thinking_content(content)
return cleaned_content