# HITL-KG / src/text_utils.py
# avojarot's picture
# Upload 22 files
# c5880fb verified
"""
Text Utilities Module
Smart text processing for node labels and content display.
Preserves word boundaries and handles multi-line content.
"""
import re
from typing import Optional
def smart_truncate(text: str, max_length: int = 50, suffix: str = "...") -> str:
    """
    Truncate text at word boundaries.

    Args:
        text: Text to truncate (leading/trailing whitespace is stripped first)
        max_length: Maximum length of the returned string, suffix included
        suffix: Suffix to add when truncated

    Returns:
        The stripped text when it already fits. Otherwise a shortened copy
        ending in ``suffix`` that is cut at the last space when that keeps
        more than half of ``max_length``, or hard-cut when it does not.
        When ``max_length`` is shorter than ``suffix`` the text is hard-cut
        to ``max_length`` characters and no suffix is appended.
    """
    if not text:
        return ""

    text = text.strip()
    if len(text) <= max_length:
        return text

    # Number of characters of real text that fit in front of the suffix.
    truncate_at = max_length - len(suffix)
    if truncate_at < 0:
        # A negative budget would make the slices below count from the END
        # of the string and produce output longer than max_length.
        return text[:max(max_length, 0)]

    # Search one position past the budget: a space sitting exactly at
    # `truncate_at` means the preceding word fits completely and should
    # not be dropped.
    last_space = text.rfind(" ", 0, truncate_at + 1)
    if last_space > max_length * 0.5:  # Only if a reasonable amount is preserved
        return text[:last_space].rstrip() + suffix

    # Fall back to hard truncation
    return text[:truncate_at].rstrip() + suffix
def smart_truncate_multiline(
    text: str,
    max_lines: int = 3,
    max_line_length: int = 50
) -> str:
    """
    Truncate multi-line text intelligently.

    Blank lines are discarded before the line budget is applied, so they
    neither use up ``max_lines`` nor trigger the trailing "..." marker.

    Args:
        text: Multi-line text (lines separated by "\\n")
        max_lines: Maximum number of non-blank lines to keep
        max_line_length: Maximum length per line

    Returns:
        The kept lines, each shortened with ``smart_truncate`` and joined
        with "\\n". A final "..." line is added when non-blank lines were
        left out.
    """
    if not text:
        return ""

    # A negative budget would otherwise slice from the end of the list.
    max_lines = max(max_lines, 0)

    # Strip every line and keep only those that still carry content.
    content_lines = [
        stripped
        for stripped in (raw.strip() for raw in text.split("\n"))
        if stripped
    ]

    result_lines = []
    for line in content_lines[:max_lines]:
        truncated = smart_truncate(line, max_line_length)
        if truncated:
            result_lines.append(truncated)

    # Only signal omission when real content was dropped.
    if len(content_lines) > max_lines:
        result_lines.append("...")

    return "\n".join(result_lines)
def create_node_label(
    content: str,
    node_type: str = "default",
    max_length: Optional[int] = None
) -> str:
    """
    Create display label for a graph node.

    Different node types get different truncation limits
    to optimize readability.

    Args:
        content: Full node content
        node_type: Type of node; unknown types use the "default" limit
        max_length: Override max length. ``None``, zero and negative
            values mean "no override" and the type-specific limit applies.

    Returns:
        Label truncated with ``smart_truncate``, or "..." when the content
        is empty or consists only of whitespace.
    """
    # Whitespace-only content would strip down to an empty label, so it
    # gets the same placeholder as empty content.
    if not content or not content.strip():
        return "..."

    # Type-specific limits (optimized for visualization)
    type_limits = {
        "query": 45,
        "reasoning": 50,
        "hypothesis": 40,
        "conclusion": 50,
        "fact": 35,
        "evidence": 35,
        "constraint": 30,
        "ghost": 30,
        "default": 40,
    }

    if max_length is not None and max_length > 0:
        limit = max_length
    else:
        limit = type_limits.get(node_type, type_limits["default"])

    return smart_truncate(content, limit)
# Basic English stop-word list, built once instead of on every call.
_STOP_WORDS = frozenset({
    "the", "and", "for", "are", "but", "not", "you", "all",
    "can", "had", "her", "was", "one", "our", "out", "has",
    "his", "how", "its", "may", "new", "now", "old", "see",
    "way", "who", "boy", "did", "get", "let", "put", "say",
    "she", "too", "use", "with", "from", "have", "this", "that",
    "been", "your", "than", "they", "will", "more", "when",
})


def extract_key_terms(text: str, max_terms: int = 5) -> list:
    """
    Extract key terms from text for search/matching.

    Simple extraction based on word frequency and length.
    For production, consider using TF-IDF or KeyBERT.

    Args:
        text: Text to extract terms from
        max_terms: Maximum terms to return; zero or negative yields []

    Returns:
        List of lowercase terms, most frequent first. Ties on frequency
        are broken by longer word first, then by first appearance.
    """
    if not text or max_terms <= 0:
        # A negative max_terms would otherwise slice from the end of the
        # ranking and return terms instead of none.
        return []

    # Tokenize: lowercase runs of at least three ASCII letters.
    candidates = re.findall(r'\b[a-z]{3,}\b', text.lower())

    # Count occurrences of everything that is not a stop word. Dict
    # insertion order records first appearance for stable tie-breaking.
    word_counts = {}
    for word in candidates:
        if word not in _STOP_WORDS:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Negated keys give descending order while the stable sort keeps
    # first-appearance order among complete ties.
    ranked = sorted(word_counts, key=lambda w: (-word_counts[w], -len(w)))
    return ranked[:max_terms]
def format_confidence(confidence: float) -> str:
    """
    Render a confidence score as a whole-number percentage.

    Args:
        confidence: Score, nominally between 0 and 1; values outside
            that range are clamped to the nearest bound

    Returns:
        Percentage string with no decimal places, e.g. "50%"
    """
    # Explicit comparisons (rather than min/max) so the clamp leaves any
    # value that is neither below 0 nor above 1 untouched.
    bounded = 0 if confidence < 0 else (1 if confidence > 1 else confidence)
    return format(bounded, ".0%")
def sanitize_content(text: str) -> str:
    """
    Sanitize text content for safe display.

    Drops non-printable characters and collapses every run of whitespace
    into a single space. Nothing is escaped.

    Args:
        text: Raw text

    Returns:
        Single-line text without leading or trailing whitespace
    """
    if not text:
        return ""

    # Keep whitespace even when it is not "printable" (no-break space,
    # "\r", "\v", "\f", ...). Deleting it outright would glue the
    # neighbouring words together; it is turned into a space below.
    kept = "".join(ch for ch in text if ch.isprintable() or ch.isspace())

    # split() without arguments breaks on any whitespace run and ignores
    # leading/trailing whitespace, so joining with " " normalizes it all.
    return " ".join(kept.split())
def highlight_terms(text: str, terms: list) -> str:
    """
    Highlight terms in text (for search results).

    Matching is case-insensitive and the matched text keeps its original
    casing. All terms are matched in a single pass, so one term is never
    highlighted inside the markers of another.

    Note: For HTML output, convert markers to <mark> tags.

    Args:
        text: Text to highlight in
        terms: Terms to highlight; empty terms are ignored

    Returns:
        Text with each match wrapped in ``**`` markers. ``text`` is
        returned unchanged when it or the usable term list is empty.
    """
    if not text or not terms:
        return text

    # Drop empty terms (they would match between every character) and
    # duplicates, then try longer terms first so "category" wins over
    # "cat" at the same position.
    needles = sorted(
        dict.fromkeys(term for term in terms if term),
        key=len,
        reverse=True,
    )
    if not needles:
        return text

    pattern = re.compile(
        "|".join(re.escape(needle) for needle in needles),
        re.IGNORECASE,
    )

    # A function replacement is inserted literally, so backslashes in the
    # matched text are never interpreted as regex template escapes.
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)