File size: 5,634 Bytes
c5880fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
"""
Text Utilities Module
Smart text processing for node labels and content display.
Preserves word boundaries and handles multi-line content.
"""
import re
from collections import Counter
from typing import Optional
def smart_truncate(text: str, max_length: int = 50, suffix: str = "...") -> str:
    """
    Truncate text at word boundaries.

    Args:
        text: Text to truncate
        max_length: Maximum length before truncation
        suffix: Suffix to add when truncated

    Returns:
        Truncated text preserving whole words. The result never
        exceeds max_length characters.
    """
    if not text:
        return ""
    text = text.strip()
    if len(text) <= max_length:
        return text
    # Budget for actual text once the suffix is accounted for
    truncate_at = max_length - len(suffix)
    if truncate_at <= 0:
        # Suffix alone would meet or exceed the budget; a negative slice
        # here would return almost the whole string, so hard-cut instead.
        return text[:max_length]
    # Try to break at word boundary
    last_space = text.rfind(" ", 0, truncate_at)
    if last_space > max_length * 0.5:  # Only if reasonable amount preserved
        return text[:last_space].rstrip() + suffix
    # Fall back to hard truncation
    return text[:truncate_at].rstrip() + suffix
def smart_truncate_multiline(
    text: str,
    max_lines: int = 3,
    max_line_length: int = 50
) -> str:
    """
    Truncate multi-line text intelligently.

    Keeps at most max_lines lines, word-truncates each one, drops lines
    that become empty, and appends "..." when lines were cut off.

    Args:
        text: Multi-line text
        max_lines: Maximum number of lines
        max_line_length: Maximum length per line

    Returns:
        Formatted multi-line text
    """
    if not text:
        return ""
    raw_lines = text.split("\n")
    shortened = [
        smart_truncate(raw.strip(), max_line_length)
        for raw in raw_lines[:max_lines]
    ]
    # Blank lines (empty after truncation) are dropped entirely
    kept = [line for line in shortened if line]
    if len(raw_lines) > max_lines:
        kept.append("...")
    return "\n".join(kept)
def create_node_label(
    content: str,
    node_type: str = "default",
    max_length: Optional[int] = None
) -> str:
    """
    Create display label for a graph node.

    Each node type has its own truncation limit so labels stay readable
    in the visualization; an explicit max_length overrides the table.

    Args:
        content: Full node content
        node_type: Type of node
        max_length: Override max length

    Returns:
        Formatted label for display
    """
    if not content:
        return "..."
    # Per-type truncation budgets (tuned for the graph view)
    limits_by_type = {
        "query": 45,
        "reasoning": 50,
        "hypothesis": 40,
        "conclusion": 50,
        "fact": 35,
        "evidence": 35,
        "constraint": 30,
        "ghost": 30,
        "default": 40,
    }
    if max_length:
        limit = max_length
    else:
        limit = limits_by_type.get(node_type, limits_by_type["default"])
    return smart_truncate(content, limit)
def extract_key_terms(text: str, max_terms: int = 5) -> list:
    """
    Extract key terms from text for search/matching.

    Simple extraction based on word frequency and length.
    For production, consider using TF-IDF or KeyBERT.

    Args:
        text: Text to extract terms from
        max_terms: Maximum terms to return

    Returns:
        List of key terms, most frequent first; ties broken by
        word length (longer first), then first-occurrence order.
    """
    if not text:
        return []
    # Clean and tokenize: lowercase alphabetic runs of 3+ characters
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())
    # Filter stop words (basic list)
    stop_words = {
        "the", "and", "for", "are", "but", "not", "you", "all",
        "can", "had", "her", "was", "one", "our", "out", "has",
        "his", "how", "its", "may", "new", "now", "old", "see",
        "way", "who", "boy", "did", "get", "let", "put", "say",
        "she", "too", "use", "with", "from", "have", "this", "that",
        "been", "your", "than", "they", "will", "more", "when",
    }
    # Counter preserves first-insertion order, so the stable sort below
    # keeps first-occurrence order for full ties, same as the dict loop.
    counts = Counter(w for w in words if w not in stop_words)
    ranked = sorted(
        counts.items(),
        key=lambda item: (item[1], len(item[0])),  # count, then length
        reverse=True
    )
    return [word for word, _ in ranked[:max_terms]]
def format_confidence(confidence: float) -> str:
    """
    Format confidence score for display.

    Values are clamped into [0, 1] before rendering.

    Args:
        confidence: Score between 0 and 1

    Returns:
        Formatted percentage string (e.g. "75%")
    """
    clamped = 0 if confidence < 0 else 1 if confidence > 1 else confidence
    return f"{clamped:.0%}"
def sanitize_content(text: str) -> str:
    """
    Sanitize text content for safe display.

    Strips control characters (keeping newlines and tabs), then
    collapses all runs of whitespace to single spaces.

    Args:
        text: Raw text

    Returns:
        Sanitized text
    """
    if not text:
        return ""
    # Drop non-printable characters, but let \n and \t survive this pass
    # (the whitespace collapse below normalizes them to spaces anyway)
    printable = [ch for ch in text if ch.isprintable() or ch in "\n\t"]
    collapsed = re.sub(r'\s+', ' ', "".join(printable))
    return collapsed.strip()
def highlight_terms(text: str, terms: list) -> str:
    """
    Highlight terms in text (for search results).

    Returns text with terms wrapped in markers.
    Note: For HTML output, convert markers to <mark> tags.

    Args:
        text: Text to highlight in
        terms: Terms to highlight

    Returns:
        Text with highlighted terms, original casing preserved
    """
    if not text or not terms:
        return text
    result = text
    for term in terms:
        # Case-insensitive match with markers
        pattern = re.compile(re.escape(term), re.IGNORECASE)
        # Wrap the matched span itself (m.group(0)) rather than the search
        # term, so the original casing in the text is preserved.
        result = pattern.sub(lambda m: f"**{m.group(0)}**", result)
    return result
|