Spaces:

mikeee
/

open-notebook

Runtime error

open-notebook / open_notebook /utils /text_utils.py

ffreemt

1st try

3793f68 about 1 month ago

5.01 kB

	"""
	Text utilities for Open Notebook.
	Extracted from main utils to avoid circular imports.
	"""

	import re
	import unicodedata
	from typing import Tuple

	# Compiled regexes used to locate model "thinking" sections in AI responses.
	# Well-formed case: an explicit <think>...</think> pair. The capture is
	# non-greedy and DOTALL lets it span multiple lines.
	THINK_PATTERN = re.compile(
	    r"<think>(.*?)</think>",
	    flags=re.DOTALL,
	)
	# Malformed case: the opening tag is missing, so everything from the start
	# of the string up to the first </think> is captured.
	THINK_PATTERN_NO_OPEN = re.compile(
	    r"^(.*?)</think>",
	    flags=re.DOTALL,
	)


	def remove_non_ascii(text: str) -> str:
	    """Return ``text`` with every character outside the ASCII range dropped.

	    Args:
	        text: The string to filter.

	    Returns:
	        The input with all runs of characters above U+007F deleted;
	        ASCII characters (including control characters) are kept as-is.
	    """
	    non_ascii_run = re.compile(r"[^\x00-\x7F]+")
	    return non_ascii_run.sub("", text)


	def remove_non_printable(text: str) -> str:
	    """Normalize whitespace and remove non-printable or unsupported characters.

	    Processing order:
	      1. Unicode space variants (U+2000-U+200B, U+202F, U+205F, U+3000)
	         become a regular space.
	      2. Line terminators (CRLF, lone CR, U+2028, U+2029) each become a
	         single newline.
	      3. Characters whose Unicode category starts with "C" are removed,
	         except newline and tab.
	      4. Non-breaking spaces become regular spaces and the text is stripped.
	      5. Everything that is not a word character, whitespace, or one of
	         ``. , ! ? -`` is removed.

	    Note that the strip in step 4 runs before the final filter in step 5.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # Replace special Unicode space characters with a regular space.
	    text = re.sub(r"[\u2000-\u200B\u202F\u205F\u3000]", " ", text)

	    # Collapse each line terminator to one newline. CRLF is matched first so
	    # that a Windows line ending yields one newline instead of two.
	    text = re.sub(r"\r\n|[\u2028\u2029\r]", "\n", text)

	    # Remove control characters, except newlines and tabs.
	    text = "".join(
	        char
	        for char in text
	        if char in "\n\t" or unicodedata.category(char)[0] != "C"
	    )

	    # Replace non-breaking spaces with regular spaces.
	    text = text.replace("\xa0", " ").strip()

	    # Keep letters (including accented ones), numbers, whitespace and
	    # basic punctuation; drop everything else.
	    return re.sub(r"[^\w\s.,!?\-\n\t]", "", text, flags=re.UNICODE)


	def parse_thinking_content(content: str) -> Tuple[str, str]:
	    """
	    Parse message content to extract thinking content from <think> tags.

	    Handles both well-formed tags and malformed output where the opening
	    <think> tag is missing but </think> is present.

	    Non-string input is not parsed: ``None`` yields an empty cleaned string,
	    anything else is converted with ``str()``. Content longer than 100000
	    characters is returned unprocessed.

	    Args:
	        content (str): The original message content

	    Returns:
	        Tuple[str, str]: (thinking_content, cleaned_content)
	            - thinking_content: Content from within <think> tags; multiple
	              blocks are stripped and joined with a blank line
	            - cleaned_content: Original content with <think> blocks removed

	    Example:
	        >>> content = "<think>Let me analyze this</think>Here's my answer"
	        >>> thinking, cleaned = parse_thinking_content(content)
	        >>> print(thinking)
	        Let me analyze this
	        >>> print(cleaned)
	        Here's my answer
	    """
	    # Input validation
	    if not isinstance(content, str):
	        return "", str(content) if content is not None else ""

	    # Limit processing for very large content (100KB limit)
	    if len(content) > 100000:
	        return "", content

	    # Find all well-formed thinking blocks
	    thinking_matches = THINK_PATTERN.findall(content)

	    if thinking_matches:
	        # Join all thinking content with double newlines
	        thinking_content = "\n\n".join(match.strip() for match in thinking_matches)

	        # Remove all <think>...</think> blocks from the original content
	        cleaned_content = THINK_PATTERN.sub("", content)

	        # Collapse runs of three or more newlines (optionally separated by
	        # other whitespace) left behind by removed blocks into one blank line.
	        cleaned_content = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned_content).strip()

	        return thinking_content, cleaned_content

	    # Handle malformed output: content</think> (missing opening tag)
	    # Some models like Nemotron output thinking without the opening <think> tag
	    malformed_match = THINK_PATTERN_NO_OPEN.match(content)
	    if malformed_match:
	        thinking_content = malformed_match.group(1).strip()
	        # Remove the thinking content and </think> tag
	        cleaned_content = content[malformed_match.end():].strip()
	        return thinking_content, cleaned_content

	    return "", content


	def clean_thinking_content(content: str) -> str:
	    """
	    Strip thinking content from an AI response and return what remains.

	    Thin wrapper around parse_thinking_content for callers that only want
	    the cleaned text and have no use for the extracted thinking.

	    Args:
	        content (str): The original message content with potential <think> tags

	    Returns:
	        str: The cleaned content, i.e. the second element of the tuple
	        returned by parse_thinking_content

	    Example:
	        >>> content = "<think>Let me think...</think>Here's the answer"
	        >>> clean_thinking_content(content)
	        "Here's the answer"
	    """
	    return parse_thinking_content(content)[1]


	def extract_text_content(content) -> str:
	    """Extract text from LLM response content.

	    Handles both plain string responses and structured content formats
	    (e.g. Gemini's envelope format):
	        [{'type': 'text', 'text': '...', 'extras': {...}}]

	    Args:
	        content: The content from an AI message, either a string or a list
	            of parts. Each part may be a plain string or a dict; dicts
	            without a "text" key (or whose "text" is None) are skipped.

	    Returns:
	        The extracted text content as a string. ``None`` yields an empty
	        string; any other unsupported type is converted with ``str()``.
	    """
	    # Missing content is an empty message, not the literal text "None".
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, list):
	        text_parts = []
	        for part in content:
	            if isinstance(part, str):
	                text_parts.append(part)
	            elif isinstance(part, dict):
	                text = part.get("text")
	                if text is None:
	                    continue
	                # Coerce non-string values so the join below cannot raise.
	                text_parts.append(text if isinstance(text, str) else str(text))
	        return "".join(text_parts)
	    return str(content)