|
|
""" |
|
|
Text Utilities Module |
|
|
|
|
|
Smart text processing for node labels and content display. |
|
|
Preserves word boundaries and handles multi-line content. |
|
|
""" |
|
|
|
|
|
import re |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
def smart_truncate(text: str, max_length: int = 50, suffix: str = "...") -> str:
    """
    Truncate text at word boundaries.

    Surrounding whitespace is stripped first. Text that already fits within
    ``max_length`` is returned unchanged. Longer text is shortened so that the
    result, suffix included, is at most ``max_length`` characters (for a
    non-negative ``max_length``).

    Args:
        text: Text to truncate
        max_length: Maximum length of the returned string
        suffix: Suffix to add when truncated

    Returns:
        Truncated text, cut at a word boundary where possible. An empty
        string when ``text`` is empty or None.
    """
    if not text:
        return ""

    text = text.strip()

    if len(text) <= max_length:
        return text

    # Characters left for real content once the suffix is accounted for.
    truncate_at = max_length - len(suffix)

    if truncate_at < 0:
        # The suffix alone does not fit. A negative slice bound would count
        # from the end of the string and keep almost the whole text, so fall
        # back to a plain hard cut without any suffix.
        return text[:max(max_length, 0)].rstrip()

    last_space = text.rfind(" ", 0, truncate_at)

    # Break at the last space only when that keeps more than half of the
    # budget; otherwise a long leading word would leave almost nothing.
    if last_space > max_length * 0.5:
        return text[:last_space].rstrip() + suffix

    # No suitable word boundary: cut mid-word.
    return text[:truncate_at].rstrip() + suffix
|
|
|
|
|
|
|
|
def smart_truncate_multiline(
    text: str,
    max_lines: int = 3,
    max_line_length: int = 50
) -> str:
    """
    Truncate multi-line text intelligently.

    The text is split on newlines, each line is stripped, and blank lines are
    discarded. The first ``max_lines`` remaining lines are shortened with
    ``smart_truncate``. A final "..." line is appended only when non-blank
    lines were actually left out.

    Args:
        text: Multi-line text
        max_lines: Maximum number of content lines to keep
        max_line_length: Maximum length per line

    Returns:
        Formatted multi-line text, joined with newlines. An empty string when
        ``text`` is empty or contains only blank lines.
    """
    if not text:
        return ""

    # Drop blank lines up front so they neither use up the line budget nor
    # trigger the overflow marker (e.g. for text that ends with a newline).
    stripped_lines = [line.strip() for line in text.split("\n")]
    content_lines = [line for line in stripped_lines if line]

    # A negative limit would slice from the end of the list; treat it as zero.
    keep = max(max_lines, 0)

    result_lines = []
    for line in content_lines[:keep]:
        truncated = smart_truncate(line, max_line_length)
        if truncated:
            result_lines.append(truncated)

    # Signal omitted content only when real lines were dropped.
    if len(content_lines) > keep:
        result_lines.append("...")

    return "\n".join(result_lines)
|
|
|
|
|
|
|
|
def create_node_label(
    content: str,
    node_type: str = "default",
    max_length: Optional[int] = None
) -> str:
    """
    Build the short label shown for a graph node.

    Each node type has its own length limit; unknown types use the
    "default" limit. The content is then shortened with ``smart_truncate``.

    Args:
        content: Full node content
        node_type: Type of node, used to look up the length limit
        max_length: Explicit limit; when None or 0 the per-type limit is used

    Returns:
        Label text for display, or "..." when there is no content
    """
    if content:
        limits_by_type = {
            "query": 45,
            "reasoning": 50,
            "hypothesis": 40,
            "conclusion": 50,
            "fact": 35,
            "evidence": 35,
            "constraint": 30,
            "ghost": 30,
            "default": 40,
        }

        # A falsy override (None or 0) defers to the per-type table.
        if max_length:
            limit = max_length
        else:
            limit = limits_by_type.get(node_type, limits_by_type["default"])

        return smart_truncate(content, limit)

    # Placeholder label for empty or missing content.
    return "..."
|
|
|
|
|
|
|
|
def extract_key_terms(text: str, max_terms: int = 5) -> list:
    """
    Pick the most prominent terms in a text for search/matching.

    Terms are lower-cased runs of at least three ASCII letters that are not
    in a small built-in stop-word list. They are ranked by how often they
    occur, then by length, both descending; remaining ties keep the order in
    which the terms first appear.

    For production, consider using TF-IDF or KeyBERT.

    Args:
        text: Text to extract terms from
        max_terms: Maximum terms to return

    Returns:
        List of key terms, best first
    """
    if not text:
        return []

    stop_words = {
        "the", "and", "for", "are", "but", "not", "you", "all",
        "can", "had", "her", "was", "one", "our", "out", "has",
        "his", "how", "its", "may", "new", "now", "old", "see",
        "way", "who", "boy", "did", "get", "let", "put", "say",
        "she", "too", "use", "with", "from", "have", "this", "that",
        "been", "your", "than", "they", "will", "more", "when",
    }

    # Lower-case first so the a-z pattern sees every ASCII letter.
    candidates = re.findall(r'\b[a-z]{3,}\b', text.lower())

    # Tally occurrences, skipping stop words as we go.
    frequency = {}
    for term in candidates:
        if term in stop_words:
            continue
        frequency[term] = frequency.get(term, 0) + 1

    # Negated keys sort descending; the stable sort keeps first-seen order
    # for terms that tie on both count and length.
    ranked = sorted(frequency, key=lambda term: (-frequency[term], -len(term)))

    return ranked[:max_terms]
|
|
|
|
|
|
|
|
def format_confidence(confidence: float) -> str:
    """
    Render a confidence score as a whole-number percentage.

    Scores below 0 are shown as 0% and scores above 1 as 100%.

    Args:
        confidence: Score, nominally between 0 and 1

    Returns:
        Percentage string with no decimal places, e.g. "87%"
    """
    # Clamp into [0, 1] before formatting.
    clamped = 0 if confidence < 0 else (1 if confidence > 1 else confidence)
    return format(clamped, ".0%")
|
|
|
|
|
|
|
|
def sanitize_content(text: str) -> str:
    """
    Sanitize text content for safe display.

    Non-printable characters are removed. Every run of whitespace, including
    newlines, tabs and non-breaking spaces, is collapsed to a single space,
    and the result is stripped.

    Args:
        text: Raw text

    Returns:
        Sanitized single-line text, or an empty string for empty input
    """
    if not text:
        return ""

    # str.isprintable() is False for every separator except the ASCII space,
    # so whitespace such as "\r", "\v" or a non-breaking space must be kept
    # explicitly; dropping it would glue the neighbouring words together.
    kept = "".join(c for c in text if c.isprintable() or c.isspace())

    # Normalise all retained whitespace to single spaces.
    return re.sub(r'\s+', ' ', kept).strip()
|
|
|
|
|
|
|
|
def highlight_terms(text: str, terms: list) -> str:
    """
    Highlight terms in text (for search results).

    Every case-insensitive occurrence of a term is wrapped in ``**`` markers.
    The matched text keeps its original casing. Empty terms are ignored, and
    where terms overlap the longest one wins.

    Note: For HTML output, convert markers to <mark> tags.

    Args:
        text: Text to highlight in
        terms: Terms to highlight

    Returns:
        Text with highlighted terms; ``text`` unchanged when there is nothing
        to highlight
    """
    if not text or not terms:
        return text

    # Drop empty terms (an empty pattern matches at every position) and
    # duplicates, keeping first-seen order.
    unique_terms = [term for term in dict.fromkeys(terms) if term]
    if not unique_terms:
        return text

    # Longest first so a term is preferred over any of its own prefixes.
    unique_terms.sort(key=len, reverse=True)

    # One combined pattern means a single pass over the text: later terms can
    # never match inside markers or highlights added for earlier terms.
    pattern = re.compile(
        "|".join(re.escape(term) for term in unique_terms),
        re.IGNORECASE,
    )

    # A function replacement inserts the matched text verbatim, so the
    # original casing survives and backslashes are not read as escapes.
    return pattern.sub(lambda match: f"**{match.group(0)}**", text)
|
|
|