File size: 5,634 Bytes
c5880fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
"""
Text Utilities Module
Smart text processing for node labels and content display.
Preserves word boundaries and handles multi-line content.
"""
import re
from collections import Counter
from typing import Optional
def smart_truncate(text: str, max_length: int = 50, suffix: str = "...") -> str:
    """
    Truncate text at word boundaries.

    Args:
        text: Text to truncate
        max_length: Maximum length before truncation
        suffix: Suffix to add when truncated

    Returns:
        Truncated text preserving whole words. The result never
        exceeds max_length characters.
    """
    if not text:
        return ""
    text = text.strip()
    if len(text) <= max_length:
        return text
    # Budget for actual text once the suffix is accounted for
    truncate_at = max_length - len(suffix)
    if truncate_at <= 0:
        # Suffix alone would meet or exceed the budget; a negative slice
        # here would return almost the whole string, so hard-cut instead.
        return text[:max_length]
    # Try to break at word boundary
    last_space = text.rfind(" ", 0, truncate_at)
    if last_space > max_length * 0.5:  # Only if reasonable amount preserved
        return text[:last_space].rstrip() + suffix
    # Fall back to hard truncation
    return text[:truncate_at].rstrip() + suffix
def smart_truncate_multiline(
    text: str,
    max_lines: int = 3,
    max_line_length: int = 50
) -> str:
    """
    Truncate multi-line text intelligently.

    Keeps at most max_lines lines, word-truncates each one, drops lines
    that become empty, and appends "..." when lines were cut off.

    Args:
        text: Multi-line text
        max_lines: Maximum number of lines
        max_line_length: Maximum length per line

    Returns:
        Formatted multi-line text
    """
    if not text:
        return ""
    raw_lines = text.split("\n")
    shortened = [
        smart_truncate(raw.strip(), max_line_length)
        for raw in raw_lines[:max_lines]
    ]
    # Blank lines (empty after truncation) are dropped entirely
    kept = [line for line in shortened if line]
    if len(raw_lines) > max_lines:
        kept.append("...")
    return "\n".join(kept)
def create_node_label(
    content: str,
    node_type: str = "default",
    max_length: Optional[int] = None
) -> str:
    """
    Create display label for a graph node.

    Each node type has its own truncation limit so labels stay readable
    in the visualization; an explicit max_length overrides the table.

    Args:
        content: Full node content
        node_type: Type of node
        max_length: Override max length

    Returns:
        Formatted label for display
    """
    if not content:
        return "..."
    # Per-type truncation budgets (tuned for the graph view)
    limits_by_type = {
        "query": 45,
        "reasoning": 50,
        "hypothesis": 40,
        "conclusion": 50,
        "fact": 35,
        "evidence": 35,
        "constraint": 30,
        "ghost": 30,
        "default": 40,
    }
    if max_length:
        limit = max_length
    else:
        limit = limits_by_type.get(node_type, limits_by_type["default"])
    return smart_truncate(content, limit)
def extract_key_terms(text: str, max_terms: int = 5) -> list:
    """
    Extract key terms from text for search/matching.

    Simple extraction based on word frequency and length.
    For production, consider using TF-IDF or KeyBERT.

    Args:
        text: Text to extract terms from
        max_terms: Maximum terms to return

    Returns:
        List of key terms, most frequent first; ties broken by
        word length (longer first), then first-occurrence order.
    """
    if not text:
        return []
    # Clean and tokenize: lowercase alphabetic runs of 3+ characters
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())
    # Filter stop words (basic list)
    stop_words = {
        "the", "and", "for", "are", "but", "not", "you", "all",
        "can", "had", "her", "was", "one", "our", "out", "has",
        "his", "how", "its", "may", "new", "now", "old", "see",
        "way", "who", "boy", "did", "get", "let", "put", "say",
        "she", "too", "use", "with", "from", "have", "this", "that",
        "been", "your", "than", "they", "will", "more", "when",
    }
    # Counter preserves first-insertion order, so the stable sort below
    # keeps first-occurrence order for full ties, same as the dict loop.
    counts = Counter(w for w in words if w not in stop_words)
    ranked = sorted(
        counts.items(),
        key=lambda item: (item[1], len(item[0])),  # count, then length
        reverse=True
    )
    return [word for word, _ in ranked[:max_terms]]
def format_confidence(confidence: float) -> str:
    """
    Format confidence score for display.

    Values are clamped into [0, 1] before rendering.

    Args:
        confidence: Score between 0 and 1

    Returns:
        Formatted percentage string (e.g. "75%")
    """
    clamped = 0 if confidence < 0 else 1 if confidence > 1 else confidence
    return f"{clamped:.0%}"
def sanitize_content(text: str) -> str:
    """
    Sanitize text content for safe display.

    Strips control characters (keeping newlines and tabs), then
    collapses all runs of whitespace to single spaces.

    Args:
        text: Raw text

    Returns:
        Sanitized text
    """
    if not text:
        return ""
    # Drop non-printable characters, but let \n and \t survive this pass
    # (the whitespace collapse below normalizes them to spaces anyway)
    printable = [ch for ch in text if ch.isprintable() or ch in "\n\t"]
    collapsed = re.sub(r'\s+', ' ', "".join(printable))
    return collapsed.strip()
def highlight_terms(text: str, terms: list) -> str:
    """
    Highlight terms in text (for search results).

    Returns text with terms wrapped in markers.
    Note: For HTML output, convert markers to <mark> tags.

    Args:
        text: Text to highlight in
        terms: Terms to highlight

    Returns:
        Text with highlighted terms, original casing preserved
    """
    if not text or not terms:
        return text
    result = text
    for term in terms:
        # Case-insensitive match with markers
        pattern = re.compile(re.escape(term), re.IGNORECASE)
        # Wrap the matched span itself (m.group(0)) rather than the search
        # term, so the original casing in the text is preserved.
        result = pattern.sub(lambda m: f"**{m.group(0)}**", result)
    return result
|