Spaces:
Running
Running
| """ | |
| Text utilities for Open Notebook. | |
| Extracted from main utils to avoid circular imports. | |
| """ | |
| import re | |
| import unicodedata | |
| from typing import Tuple | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from .token_utils import token_count | |
| # Patterns for matching thinking content in AI responses | |
| # Standard pattern: <think>...</think> | |
| THINK_PATTERN = re.compile(r"<think>(.*?)</think>", re.DOTALL) | |
| # Pattern for malformed output: content</think> (missing opening tag) | |
| THINK_PATTERN_NO_OPEN = re.compile(r"^(.*?)</think>", re.DOTALL) | |
| def split_text(txt: str, chunk_size=500): | |
| """ | |
| Split the input text into chunks. | |
| Args: | |
| txt (str): The input text to be split. | |
| chunk_size (int): The size of each chunk. Default is 500. | |
| Returns: | |
| list: A list of text chunks. | |
| """ | |
| overlap = int(chunk_size * 0.15) | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=overlap, | |
| length_function=token_count, | |
| separators=[ | |
| "\n\n", | |
| "\n", | |
| ".", | |
| ",", | |
| " ", | |
| "\u200b", # Zero-width space | |
| "\uff0c", # Fullwidth comma | |
| "\u3001", # Ideographic comma | |
| "\uff0e", # Fullwidth full stop | |
| "\u3002", # Ideographic full stop | |
| "", | |
| ], | |
| ) | |
| return text_splitter.split_text(txt) | |
| def remove_non_ascii(text: str) -> str: | |
| """Remove non-ASCII characters from text.""" | |
| return re.sub(r"[^\x00-\x7F]+", "", text) | |
| def remove_non_printable(text: str) -> str: | |
| """Remove non-printable characters from text.""" | |
| # Replace any special Unicode whitespace characters with a regular space | |
| text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text) | |
| # Replace unusual line terminators with a single newline | |
| text = re.sub(r"[\u2028\u2029\r]", "\n", text) | |
| # Remove control characters, except newlines and tabs | |
| text = "".join( | |
| char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t" | |
| ) | |
| # Replace non-breaking spaces with regular spaces | |
| text = text.replace("\xa0", " ").strip() | |
| # Keep letters (including accented ones), numbers, spaces, newlines, tabs, and basic punctuation | |
| return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE) | |
| def parse_thinking_content(content: str) -> Tuple[str, str]: | |
| """ | |
| Parse message content to extract thinking content from <think> tags. | |
| Handles both well-formed tags and malformed output where the opening | |
| <think> tag is missing but </think> is present. | |
| Args: | |
| content (str): The original message content | |
| Returns: | |
| Tuple[str, str]: (thinking_content, cleaned_content) | |
| - thinking_content: Content from within <think> tags | |
| - cleaned_content: Original content with <think> blocks removed | |
| Example: | |
| >>> content = "<think>Let me analyze this</think>Here's my answer" | |
| >>> thinking, cleaned = parse_thinking_content(content) | |
| >>> print(thinking) | |
| "Let me analyze this" | |
| >>> print(cleaned) | |
| "Here's my answer" | |
| """ | |
| # Input validation | |
| if not isinstance(content, str): | |
| return "", str(content) if content is not None else "" | |
| # Limit processing for very large content (100KB limit) | |
| if len(content) > 100000: | |
| return "", content | |
| # Find all well-formed thinking blocks | |
| thinking_matches = THINK_PATTERN.findall(content) | |
| if thinking_matches: | |
| # Join all thinking content with double newlines | |
| thinking_content = "\n\n".join(match.strip() for match in thinking_matches) | |
| # Remove all <think>...</think> blocks from the original content | |
| cleaned_content = THINK_PATTERN.sub("", content) | |
| # Clean up extra whitespace | |
| cleaned_content = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned_content).strip() | |
| return thinking_content, cleaned_content | |
| # Handle malformed output: content</think> (missing opening tag) | |
| # Some models like Nemotron output thinking without the opening <think> tag | |
| malformed_match = THINK_PATTERN_NO_OPEN.match(content) | |
| if malformed_match: | |
| thinking_content = malformed_match.group(1).strip() | |
| # Remove the thinking content and </think> tag | |
| cleaned_content = content[malformed_match.end():].strip() | |
| return thinking_content, cleaned_content | |
| return "", content | |
| def clean_thinking_content(content: str) -> str: | |
| """ | |
| Remove thinking content from AI responses, returning only the cleaned content. | |
| This is a convenience function for cases where you only need the cleaned | |
| content and don't need access to the thinking process. | |
| Args: | |
| content (str): The original message content with potential <think> tags | |
| Returns: | |
| str: Content with <think> blocks removed and whitespace cleaned | |
| Example: | |
| >>> content = "<think>Let me think...</think>Here's the answer" | |
| >>> clean_thinking_content(content) | |
| "Here's the answer" | |
| """ | |
| _, cleaned_content = parse_thinking_content(content) | |
| return cleaned_content | |