Spaces:

avojarot
/

HITL-KG

Sleeping

App Files Files Community

HITL-KG / src /text_utils.py

avojarot

Upload 22 files

c5880fb verified 25 days ago

raw

history blame contribute delete

5.63 kB

	"""
	Text Utilities Module

	Smart text processing for node labels and content display.
	Preserves word boundaries and handles multi-line content.
	"""

	import re
	from typing import Optional


	def smart_truncate(text: str, max_length: int = 50, suffix: str = "...") -> str:
	    """
	    Truncate text at a word boundary so the result fits in max_length.

	    The text is stripped first. If it already fits it is returned
	    unchanged; otherwise it is cut at the last space that keeps more
	    than half of max_length, falling back to a hard cut, and the
	    suffix is appended.

	    Args:
	        text: Text to truncate
	        max_length: Maximum length of the returned string
	        suffix: Suffix to add when truncated

	    Returns:
	        Truncated text preserving whole words where possible. Empty
	        string for empty input. When max_length leaves no room for any
	        text, the suffix itself cut down to max_length.
	    """
	    if not text:
	        return ""

	    text = text.strip()

	    if len(text) <= max_length:
	        return text

	    # Number of characters of the original text that fit before the suffix
	    truncate_at = max_length - len(suffix)

	    if truncate_at <= 0:
	        # No room for any text. A negative slice bound would cut from the
	        # end of the string and produce a result longer than max_length.
	        return suffix[:max(max_length, 0)]

	    # Search one position past the cut so that a space sitting exactly at
	    # truncate_at (a word ending right at the limit) counts as a boundary.
	    last_space = text.rfind(" ", 0, truncate_at + 1)

	    if last_space > max_length * 0.5:  # Only if reasonable amount preserved
	        return text[:last_space].rstrip() + suffix

	    # Fall back to hard truncation
	    return text[:truncate_at].rstrip() + suffix


	def smart_truncate_multiline(
	    text: str,
	    max_lines: int = 3,
	    max_line_length: int = 50
	) -> str:
	    """
	    Truncate multi-line text intelligently.

	    Blank lines are dropped before the line budget is applied, so they
	    neither consume one of the max_lines slots nor trigger the trailing
	    "..." marker on their own (for example text ending in a newline).

	    Args:
	        text: Multi-line text, split on "\\n"
	        max_lines: Maximum number of content lines to keep
	        max_line_length: Maximum length per line

	    Returns:
	        The kept lines, each stripped and truncated with
	        smart_truncate, joined by newlines. A final "..." line is added
	        when content lines were left out.
	    """
	    if not text:
	        return ""

	    # Strip each line and keep only the ones with visible content.
	    stripped_lines = (line.strip() for line in text.split("\n"))
	    content_lines = [line for line in stripped_lines if line]

	    # Clamp so a negative max_lines keeps nothing instead of slicing
	    # from the end of the list.
	    keep = max(max_lines, 0)

	    result_lines = []
	    for line in content_lines[:keep]:
	        truncated = smart_truncate(line, max_line_length)
	        if truncated:
	            result_lines.append(truncated)

	    # Signal omitted content only when real lines were actually dropped.
	    if len(content_lines) > keep:
	        result_lines.append("...")

	    return "\n".join(result_lines)


	def create_node_label(
	    content: str,
	    node_type: str = "default",
	    max_length: Optional[int] = None
	) -> str:
	    """
	    Create display label for a graph node.

	    Different node types get different truncation limits
	    to optimize readability.

	    Args:
	        content: Full node content
	        node_type: Type of node; unknown types use the "default" limit
	        max_length: Override max length. None or 0 means "use the
	            limit for node_type".

	    Returns:
	        Formatted label for display, or "..." when content is empty
	        or contains only whitespace
	    """
	    # Whitespace-only content would be stripped to an empty label by
	    # smart_truncate, so give it the same placeholder as empty content.
	    if not content or not content.strip():
	        return "..."

	    # Type-specific limits (optimized for visualization)
	    type_limits = {
	        "query": 45,
	        "reasoning": 50,
	        "hypothesis": 40,
	        "conclusion": 50,
	        "fact": 35,
	        "evidence": 35,
	        "constraint": 30,
	        "ghost": 30,
	        "default": 40,
	    }

	    # A falsy override (None or 0) falls back to the per-type limit.
	    limit = max_length or type_limits.get(node_type, type_limits["default"])
	    return smart_truncate(content, limit)


	def extract_key_terms(text: str, max_terms: int = 5) -> list:
	    """
	    Pick the most prominent words of a text for search/matching.

	    Words are lower-cased runs of at least three ASCII letters. A small
	    built-in stop-word list is removed, and the remaining words are
	    ranked by how often they occur, with longer words winning ties.
	    Words equal in both count and length keep their order of first
	    appearance.

	    Args:
	        text: Text to extract terms from
	        max_terms: Maximum terms to return

	    Returns:
	        List of key terms, most prominent first
	    """
	    if not text:
	        return []

	    # Basic stop-word list
	    ignored = {
	        "the", "and", "for", "are", "but", "not", "you", "all",
	        "can", "had", "her", "was", "one", "our", "out", "has",
	        "his", "how", "its", "may", "new", "now", "old", "see",
	        "way", "who", "boy", "did", "get", "let", "put", "say",
	        "she", "too", "use", "with", "from", "have", "this", "that",
	        "been", "your", "than", "they", "will", "more", "when",
	    }

	    # Tokenize and tally in one pass, skipping stop words
	    frequency = {}
	    for token in re.findall(r'\b[a-z]{3,}\b', text.lower()):
	        if token in ignored:
	            continue
	        frequency[token] = frequency.get(token, 0) + 1

	    # Highest count first, then longest word; the sort is stable, so
	    # full ties stay in first-seen order.
	    ranked = sorted(frequency, key=lambda term: (-frequency[term], -len(term)))

	    return ranked[:max_terms]


	def format_confidence(confidence: float) -> str:
	    """
	    Render a confidence score as a whole-number percentage.

	    Values below 0 are shown as 0% and values above 1 as 100%.

	    Args:
	        confidence: Score, expected between 0 and 1

	    Returns:
	        Percentage string with no decimals, e.g. "87%"
	    """
	    # Clamp into [0, 1]: raise to the lower bound, then cap at the upper.
	    clamped = min(max(confidence, 0), 1)
	    return f"{clamped:.0%}"


	def sanitize_content(text: str) -> str:
	    """
	    Sanitize text content for safe display.

	    Removes non-printable characters and collapses every run of
	    whitespace (including newlines, tabs, carriage returns and Unicode
	    spaces such as the non-breaking space) into a single space.

	    Args:
	        text: Raw text

	    Returns:
	        Sanitized single-line text with no leading or trailing
	        whitespace; empty string for empty input
	    """
	    if not text:
	        return ""

	    # Remove control characters but keep all whitespace for now.
	    # str.isprintable() is False for every whitespace character except
	    # the ASCII space, so dropping them here would glue the neighbouring
	    # words together instead of leaving a separator to normalize.
	    text = "".join(c for c in text if c.isprintable() or c.isspace())

	    # Normalize whitespace
	    text = re.sub(r'\s+', ' ', text)

	    return text.strip()


	def highlight_terms(text: str, terms: list, marker: str = "**") -> str:
	    """
	    Highlight terms in text (for search results).

	    Every case-insensitive occurrence of a term is wrapped in marker on
	    both sides. The matched text keeps its original casing.
	    Note: For HTML output, convert markers to <mark> tags.

	    Args:
	        text: Text to highlight in
	        terms: Terms to highlight; empty terms are ignored
	        marker: String placed before and after each match

	    Returns:
	        Text with highlighted terms; the input text unchanged when
	        there is nothing to highlight
	    """
	    if not text or not terms:
	        return text

	    # Skip empty terms (an empty pattern matches at every position) and
	    # try longer terms first so "heart attack" wins over "heart".
	    candidates = sorted(
	        {term for term in terms if term},
	        key=lambda term: (-len(term), term),
	    )
	    if not candidates:
	        return text

	    # One combined pattern and a single pass: text that was already
	    # wrapped, including the marker itself, is never matched again.
	    pattern = re.compile(
	        "|".join(re.escape(term) for term in candidates),
	        re.IGNORECASE,
	    )

	    # A function replacement inserts the matched text literally, which
	    # keeps its casing and avoids backslash processing of the term.
	    return pattern.sub(
	        lambda match: f"{marker}{match.group(0)}{marker}",
	        text,
	    )