Spaces:

Agents-MCP-Hackathon
/

Smart-Auto-Complete

Sleeping

File size: 8,539 Bytes

"""
Utility functions for Smart Auto-Complete
Provides common functionality for text processing, logging, and validation
"""

import html
import logging
import re
import sys
import unicodedata
from typing import Any, Dict, List, Optional, Tuple


def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Set up logging configuration for the application

    Any handlers already attached to the "smart_autocomplete" logger are
    removed and replaced with a single stdout handler, so calling this
    function repeatedly does not duplicate output.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. A numeric level is accepted as well. Values
            that do not resolve to a level fall back to INFO and a warning
            is logged.

    Returns:
        Configured logger instance
    """
    # Resolve the level defensively: getattr() on an arbitrary name can fail,
    # or return a logging-module attribute that is not a level at all.
    if isinstance(level, int) and not isinstance(level, bool):
        resolved_level = level
        level_is_valid = True
    else:
        candidate = getattr(logging, str(level).strip().upper(), None)
        level_is_valid = isinstance(candidate, int)
        resolved_level = candidate if level_is_valid else logging.INFO

    # Create logger
    logger = logging.getLogger("smart_autocomplete")
    logger.setLevel(resolved_level)

    # Remove existing handlers to avoid duplicates (iterate over a copy
    # because removeHandler mutates logger.handlers)
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # Create console handler with formatting
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(resolved_level)
    console_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )

    # Add handler to logger
    logger.addHandler(console_handler)

    if not level_is_valid:
        # Report the fallback instead of silently ignoring the bad value
        logger.warning("Unknown log level %r; defaulting to INFO", level)

    return logger


def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing

    Processing order matters: the text is Unicode-normalized and stripped of
    control characters BEFORE it is HTML-escaped, so that compatibility
    characters which normalize to markup characters (for example the
    fullwidth less-than sign U+FF1C, which NFKC folds to "<") are escaped
    as well instead of slipping through.

    Args:
        text: Raw input text (falsy values yield an empty string; other
            non-string values are converted with str())

    Returns:
        Cleaned text: NFKC-normalized, control characters removed,
        HTML-escaped, runs of spaces/tabs collapsed to one space, at most
        two consecutive newlines, and leading/trailing whitespace trimmed
    """
    if not text:
        return ""

    # Convert to string if not already
    text = str(text)

    # Normalize unicode characters first, so escaping sees the final form
    text = unicodedata.normalize("NFKC", text)

    # Remove control characters except newlines and tabs. Done before the
    # whitespace collapsing below so a removed character between two spaces
    # does not leave a double space behind.
    text = "".join(char for char in text if ord(char) >= 32 or char in "\n\t")

    # HTML escape to prevent injection
    text = html.escape(text)

    # Remove excessive whitespace but preserve structure
    text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)  # Max 2 consecutive newlines
    text = re.sub(r"[ \t]+", " ", text)  # Multiple spaces/tabs to single space

    # Trim leading/trailing whitespace
    return text.strip()


def _contains_phrase(text_lower: str, phrase: str) -> bool:
    """
    Check whether a phrase occurs in the text as a whole word or phrase

    A word boundary is required on each side of the phrase that starts or
    ends with an alphanumeric character, so "hi" does not match inside
    "this". Phrases whose edge is a symbol or a space (such as "//" or
    "def ") are matched as-is on that side.

    Args:
        text_lower: Lower-cased text to search
        phrase: Lower-cased phrase to look for

    Returns:
        True if the phrase is found, False otherwise
    """
    pattern = re.escape(phrase)
    if phrase[:1].isalnum():
        pattern = r"\b" + pattern
    if phrase[-1:].isalnum():
        pattern = pattern + r"\b"
    return re.search(pattern, text_lower) is not None


def extract_context_hints(text: str) -> Dict[str, Any]:
    """
    Extract contextual hints from the input text to improve suggestions

    Keyword detection is case-insensitive and matches whole words, so short
    keywords are not triggered by longer words that merely contain them.

    Args:
        text: Input text to analyze (None is treated as an empty string)

    Returns:
        Dictionary containing context hints:
            length (int), word_count (int), has_greeting (bool),
            has_signature (bool), has_code_markers (bool),
            has_questions (bool), tone ("neutral", "formal" or "casual")
            and language_style ("linkedin", "technical", "business" or
            "creative")
    """
    text = text or ""
    text_lower = text.lower()

    def has_any(phrases: List[str]) -> bool:
        return any(_contains_phrase(text_lower, phrase) for phrase in phrases)

    def count_present(phrases: List[str]) -> int:
        return sum(1 for phrase in phrases if _contains_phrase(text_lower, phrase))

    hints = {
        "length": len(text),
        "word_count": len(text.split()),
        "has_greeting": False,
        "has_signature": False,
        "has_code_markers": False,
        "has_questions": False,
        "tone": "neutral",
        "language_style": "linkedin",
    }

    # Check for email patterns
    email_greetings = [
        "dear",
        "hello",
        "hi",
        "greetings",
        "good morning",
        "good afternoon",
    ]
    email_signatures = [
        "sincerely",
        "best regards",
        "thank you",
        "yours truly",
        "kind regards",
    ]

    hints["has_greeting"] = has_any(email_greetings)
    hints["has_signature"] = has_any(email_signatures)

    # Check for code patterns
    # NOTE(review): the symbol markers are plain substring matches, so "#"
    # also fires on hashtags and "//" on URLs - confirm this is intended.
    code_markers = [
        "//",
        "/*",
        "*/",
        "#",
        "def ",
        "function",
        "class ",
        "import ",
        "from ",
    ]
    hints["has_code_markers"] = has_any(code_markers)

    # Check for questions
    hints["has_questions"] = "?" in text or has_any(
        ["what", "how", "why", "when", "where", "who"]
    )

    # Determine tone by comparing how many distinct formal vs casual
    # keywords are present; a tie leaves the tone "neutral"
    formal_words = ["please", "kindly", "respectfully", "sincerely", "professional"]
    casual_words = ["hey", "yeah", "cool", "awesome", "thanks"]

    formal_count = count_present(formal_words)
    casual_count = count_present(casual_words)

    if formal_count > casual_count:
        hints["tone"] = "formal"
    elif casual_count > formal_count:
        hints["tone"] = "casual"

    # Determine language style (first matching rule wins)
    if hints["has_code_markers"]:
        hints["language_style"] = "technical"
    elif hints["has_greeting"] or hints["has_signature"]:
        hints["language_style"] = "business"
    elif has_any(["once upon", "story", "character", "plot"]):
        hints["language_style"] = "creative"

    return hints


def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers

    This is a format check only; it does not contact the provider.

    Args:
        api_key: The API key to validate (surrounding whitespace is ignored)
        provider: The provider name (openai, anthropic), case-insensitive;
            surrounding whitespace is ignored

    Returns:
        True if the key format is valid for the provider, False otherwise
        (including for missing/non-string arguments and unknown providers)
    """
    if not api_key or not isinstance(api_key, str):
        return False
    if not isinstance(provider, str):
        return False

    min_key_length = 40
    anthropic_prefix = "sk-ant-"
    openai_prefix = "sk-"

    key = api_key.strip()
    provider_name = provider.strip().lower()

    if len(key) < min_key_length:
        return False

    if provider_name == "anthropic":
        # Anthropic keys start with 'sk-ant-'
        return key.startswith(anthropic_prefix)

    if provider_name == "openai":
        # OpenAI keys start with 'sk-'. Anthropic keys share that prefix, so
        # they are excluded explicitly to catch a key pasted for the wrong
        # provider.
        return key.startswith(openai_prefix) and not key.startswith(anthropic_prefix)

    return False


def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries

    The returned string never exceeds max_length characters; when text is
    shortened, the trailing "..." is counted as part of that budget.

    Args:
        text: Text to truncate
        max_length: Maximum allowed length of the result, ellipsis included.
            Values of zero or below yield an empty string; values too small
            to fit the ellipsis yield a plain cut without it.
        preserve_words: Whether to back off to the previous space instead of
            cutting in the middle of a word

    Returns:
        The original text if it already fits, otherwise the truncated text
    """
    ellipsis = "..."

    if max_length <= 0:
        return ""

    if len(text) <= max_length:
        return text

    if max_length <= len(ellipsis):
        # No room for any content plus the ellipsis; hard cut instead
        return text[:max_length]

    # Reserve room for the ellipsis so the result stays within max_length
    limit = max_length - len(ellipsis)
    truncated = text[:limit]

    # Back off to a word boundary only when the cut lands inside a word
    # (text[limit] exists because len(text) > max_length > limit)
    if preserve_words and not text[limit].isspace():
        last_space = truncated.rfind(" ")
        # Only use word boundary if it's not too far back
        if last_space > limit * 0.8:
            truncated = truncated[:last_space]

    return truncated.rstrip() + ellipsis


def format_suggestions_for_display(
    suggestions: List[str], max_display_length: int = 100
) -> List[Dict[str, Any]]:
    """
    Format suggestions for display in the UI

    Each suggestion is cleaned with sanitize_input() and a shortened display
    version is produced with truncate_text().

    Args:
        suggestions: List of suggestion strings (None is treated as empty)
        max_display_length: Maximum length passed to truncate_text() for the
            display version

    Returns:
        List of formatted suggestion dictionaries, one per input suggestion
        and in the same order, with keys:
            id (int, 1-based position), text (str, cleaned suggestion),
            display_text (str, truncated cleaned suggestion),
            length (int, characters in text), word_count (int)
    """
    formatted: List[Dict[str, Any]] = []

    for position, suggestion in enumerate(suggestions or [], 1):
        # Clean the suggestion
        clean_suggestion = sanitize_input(suggestion)

        # NOTE(review): truncation runs on the already-cleaned text; if
        # sanitize_input() HTML-escapes, a cut could land inside an entity -
        # confirm how display_text is rendered.
        formatted.append(
            {
                "id": position,
                "text": clean_suggestion,
                "display_text": truncate_text(clean_suggestion, max_display_length),
                "length": len(clean_suggestion),
                "word_count": len(clean_suggestion.split()),
            }
        )

    return formatted


def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Score how similar two texts are by their shared vocabulary

    Both texts are lower-cased and split on whitespace; the score is the
    Jaccard index of the two resulting word sets (shared words divided by
    all distinct words).

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1; 0.0 when either text is empty or
        neither contains any words
    """
    if not (text1 and text2):
        return 0.0

    # Distinct lower-cased tokens of each text
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    all_tokens = tokens_a | tokens_b
    if not all_tokens:
        # Both inputs were whitespace only
        return 0.0

    shared_tokens = tokens_a & tokens_b
    return len(shared_tokens) / len(all_tokens)


def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text

    Args:
        text: Text to analyze

    Returns:
        Dictionary with text statistics:
            characters: number of non-whitespace characters
            words: number of whitespace-separated words
            sentences: number of runs of ".", "!" or "?" (rough estimate,
                at least 1 for non-blank text)
            paragraphs: number of non-blank chunks separated by blank lines
                (at least 1 for non-blank text)
        All values are 0 when the text is empty or whitespace only.
    """
    # Treat whitespace-only text like empty text so the counts stay
    # consistent (no "1 sentence" for a string with zero words)
    if not text or not text.strip():
        return {"characters": 0, "words": 0, "sentences": 0, "paragraphs": 0}

    # Count characters (excluding every kind of whitespace, "\r" included)
    char_count = sum(1 for char in text if not char.isspace())

    # Count words
    word_count = len(text.split())

    # Count sentences (rough estimate)
    sentence_count = len(re.findall(r"[.!?]+", text))

    # Count paragraphs; a separating blank line may itself contain
    # whitespace or come from "\r\n" line endings
    paragraph_count = len([p for p in re.split(r"\n\s*\n", text) if p.strip()])

    return {
        "characters": char_count,
        "words": word_count,
        "sentences": max(1, sentence_count),  # At least 1 sentence
        "paragraphs": max(1, paragraph_count),  # At least 1 paragraph
    }