"""
Text utilities for Open Notebook.
Extracted from main utils to avoid circular imports.
"""

import re
import unicodedata
from typing import List, Tuple

from langchain_text_splitters import RecursiveCharacterTextSplitter

from .token_utils import token_count

# Patterns for matching thinking content in AI responses
# Standard pattern: <think>...</think>
THINK_PATTERN = re.compile(r"<think>(.*?)</think>", re.DOTALL)
# Pattern for malformed output: content</think> (missing opening tag)
THINK_PATTERN_NO_OPEN = re.compile(r"^(.*?)</think>", re.DOTALL)

# Inputs longer than this many characters are returned untouched by
# parse_thinking_content instead of being scanned with the regexes above.
_MAX_THINKING_PARSE_CHARS = 100000


def split_text(txt: str, chunk_size: int = 500) -> List[str]:
    """
    Split the input text into overlapping chunks.

    Chunk length is measured with ``token_count`` (passed to the splitter as
    its length function), and consecutive chunks overlap by 15% of
    ``chunk_size``.

    Args:
        txt (str): The input text to be split.
        chunk_size (int): The size of each chunk. Default is 500.

    Returns:
        list: A list of text chunks.
    """
    overlap = int(chunk_size * 0.15)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=token_count,
        # Separators are tried in order; the trailing "" is the
        # character-level fallback.
        separators=[
            "\n\n",
            "\n",
            ".",
            ",",
            " ",
            "\u200b",  # Zero-width space
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            "\u3002",  # Ideographic full stop
            "",
        ],
    )
    return text_splitter.split_text(txt)


def remove_non_ascii(text: str) -> str:
    """Remove non-ASCII characters from text."""
    return re.sub(r"[^\x00-\x7F]+", "", text)


def remove_non_printable(text: str) -> str:
    """
    Normalize whitespace and strip non-printable characters from text.

    Note that the final step also drops every character that is not a word
    character, whitespace, or one of ``. , ! ? -`` -- so printable symbols
    such as quotes, colons and brackets are removed as well.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Replace any special Unicode whitespace characters with a regular space
    text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)

    # Replace unusual line terminators with a single newline. CRLF is matched
    # first so that "\r\n" collapses to one "\n" rather than two.
    text = re.sub(r"\r\n|[\u2028\u2029\r]", "\n", text)

    # Remove control characters, except newlines and tabs
    text = "".join(
        char
        for char in text
        if unicodedata.category(char)[0] != "C" or char in "\n\t"
    )

    # Replace non-breaking spaces with regular spaces
    text = text.replace("\xa0", " ").strip()

    # Keep letters (including accented ones), numbers, spaces, newlines, tabs,
    # and basic punctuation
    return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)


def parse_thinking_content(content: str) -> Tuple[str, str]:
    """
    Parse message content to extract thinking content from <think> tags.

    Handles both well-formed tags and malformed output where the opening
    <think> tag is missing but </think> is present.

    Args:
        content (str): The original message content

    Returns:
        Tuple[str, str]: (thinking_content, cleaned_content)
            - thinking_content: Content from within <think> tags
            - cleaned_content: Original content with <think> blocks removed

    Example:
        >>> content = "<think>Let me analyze this</think>Here's my answer"
        >>> thinking, cleaned = parse_thinking_content(content)
        >>> print(thinking)
        Let me analyze this
        >>> print(cleaned)
        Here's my answer
    """
    # Input validation: non-string input yields no thinking content and a
    # stringified (or empty, for None) cleaned value.
    if not isinstance(content, str):
        return "", str(content) if content is not None else ""

    # Limit processing for very large content (100KB limit)
    if len(content) > _MAX_THINKING_PARSE_CHARS:
        return "", content

    # Find all well-formed thinking blocks
    thinking_matches = THINK_PATTERN.findall(content)

    if thinking_matches:
        # Join all thinking content with double newlines
        thinking_content = "\n\n".join(match.strip() for match in thinking_matches)

        # Remove all <think>...</think> blocks from the original content
        cleaned_content = THINK_PATTERN.sub("", content)

        # Clean up extra whitespace left behind by the removed blocks
        cleaned_content = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned_content).strip()

        return thinking_content, cleaned_content

    # Handle malformed output: content</think> (missing opening tag)
    # Some models like Nemotron output thinking without the opening <think> tag
    malformed_match = THINK_PATTERN_NO_OPEN.match(content)
    if malformed_match:
        thinking_content = malformed_match.group(1).strip()
        # Remove the thinking content and </think> tag
        cleaned_content = content[malformed_match.end():].strip()
        return thinking_content, cleaned_content

    return "", content


def clean_thinking_content(content: str) -> str:
    """
    Remove thinking content from AI responses, returning only the cleaned content.

    This is a convenience function for cases where you only need the cleaned
    content and don't need access to the thinking process.

    Args:
        content (str): The original message content with potential <think> tags

    Returns:
        str: Content with <think> blocks removed and whitespace cleaned

    Example:
        >>> content = "<think>Let me think...</think>Here's the answer"
        >>> clean_thinking_content(content)
        "Here's the answer"
    """
    _, cleaned_content = parse_thinking_content(content)
    return cleaned_content