# Sandipan Haldar
# feat: Replace general context with LinkedIn-specific context
# 770544d
"""
Utility functions for Smart Auto-Complete
Provides common functionality for text processing, logging, and validation
"""
import html
import logging
import re
import sys
import unicodedata
from typing import Any, Dict, List, Optional, Tuple
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Set up logging configuration for the application

    Configures the ``smart_autocomplete`` logger with a single stdout
    handler. Calling this repeatedly is safe: handlers attached earlier are
    removed and closed before the new one is added.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. Unrecognised names fall back to INFO instead
            of raising.

    Returns:
        Configured logger instance
    """
    # Explicit whitelist: looking the name up with getattr() on the logging
    # module would accept any attribute (e.g. "getLogger") and crash later.
    known_levels = {
        "CRITICAL": logging.CRITICAL,
        "FATAL": logging.FATAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "WARN": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }
    numeric_level = known_levels.get(str(level).strip().upper())
    level_was_unknown = numeric_level is None
    if level_was_unknown:
        numeric_level = logging.INFO

    # Create logger
    logger = logging.getLogger("smart_autocomplete")
    logger.setLevel(numeric_level)

    # Remove existing handlers to avoid duplicates, releasing their resources
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

    # Create console handler with formatting
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(numeric_level)
    console_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )

    # Add handler to logger
    logger.addHandler(console_handler)

    if level_was_unknown:
        # Reported only after the handler exists so the message is visible.
        logger.warning("Unknown log level %r; falling back to INFO", level)

    return logger
def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing

    Steps, in order: convert to ``str``, NFKC-normalize, drop control
    characters (except newline and tab), collapse excessive whitespace,
    HTML-escape, and trim.

    Args:
        text: Raw input text (falsy values such as ``None`` or ``""`` yield
            an empty string)

    Returns:
        Cleaned and sanitized text
    """
    if not text:
        return ""

    # Convert to string if not already
    text = str(text)

    # Normalize unicode characters. This must happen BEFORE escaping: NFKC
    # folds compatibility forms such as fullwidth "＜" into plain "<", which
    # would otherwise slip past html.escape unescaped.
    text = unicodedata.normalize("NFKC", text)

    # Remove control characters (Unicode category Cc: C0, DEL and C1)
    # except newlines and tabs. Done before whitespace collapsing so that a
    # removed character cannot leave two adjacent spaces behind.
    text = "".join(
        char
        for char in text
        if char in "\n\t" or unicodedata.category(char) != "Cc"
    )

    # Remove excessive whitespace but preserve structure
    text = re.sub(r"\n\s*\n\s*\n", "\n\n", text)  # Max 2 consecutive newlines
    text = re.sub(r"[ \t]+", " ", text)  # Multiple spaces/tabs to single space

    # HTML escape to prevent injection
    text = html.escape(text)

    # Trim leading/trailing whitespace
    return text.strip()
def _contains_phrase(haystack: str, phrase: str) -> bool:
    """Return True if *phrase* occurs in *haystack* as whole word(s)."""
    pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
    return re.search(pattern, haystack) is not None


def extract_context_hints(text: str) -> Dict[str, Any]:
    """
    Extract contextual hints from the input text to improve suggestions

    Word and phrase lists (greetings, signatures, question words, tone words,
    creative cues) are matched on word boundaries, so "hi" is not detected
    inside "this" and "who" is not detected inside "whole".

    Args:
        text: Input text to analyze (``None`` is treated as empty text)

    Returns:
        Dictionary containing context hints with the keys ``length``,
        ``word_count``, ``has_greeting``, ``has_signature``,
        ``has_code_markers``, ``has_questions``, ``tone``
        ("neutral", "formal" or "casual") and ``language_style``
        ("linkedin", "technical", "business" or "creative")
    """
    text = text or ""
    hints: Dict[str, Any] = {
        "length": len(text),
        "word_count": len(text.split()),
        "has_greeting": False,
        "has_signature": False,
        "has_code_markers": False,
        "has_questions": False,
        "tone": "neutral",
        "language_style": "linkedin",
    }
    text_lower = text.lower()

    # Check for email patterns
    email_greetings = [
        "dear",
        "hello",
        "hi",
        "greetings",
        "good morning",
        "good afternoon",
    ]
    email_signatures = [
        "sincerely",
        "best regards",
        "thank you",
        "yours truly",
        "kind regards",
    ]
    hints["has_greeting"] = any(
        _contains_phrase(text_lower, greeting) for greeting in email_greetings
    )
    hints["has_signature"] = any(
        _contains_phrase(text_lower, signature) for signature in email_signatures
    )

    # Check for code patterns.
    # NOTE(review): these stay plain substring checks because several markers
    # are symbols; "#" and "//" therefore also match hashtags and URLs —
    # confirm whether that is intended for LinkedIn-style text.
    code_markers = [
        "//",
        "/*",
        "*/",
        "#",
        "def ",
        "function",
        "class ",
        "import ",
        "from ",
    ]
    hints["has_code_markers"] = any(marker in text_lower for marker in code_markers)

    # Check for questions
    question_words = ["what", "how", "why", "when", "where", "who"]
    hints["has_questions"] = "?" in text or any(
        _contains_phrase(text_lower, word) for word in question_words
    )

    # Determine tone
    formal_words = ["please", "kindly", "respectfully", "sincerely", "professional"]
    casual_words = ["hey", "yeah", "cool", "awesome", "thanks"]
    formal_count = sum(
        1 for word in formal_words if _contains_phrase(text_lower, word)
    )
    casual_count = sum(
        1 for word in casual_words if _contains_phrase(text_lower, word)
    )
    if formal_count > casual_count:
        hints["tone"] = "formal"
    elif casual_count > formal_count:
        hints["tone"] = "casual"

    # Determine language style (first matching rule wins)
    creative_cues = ["once upon", "story", "character", "plot"]
    if hints["has_code_markers"]:
        hints["language_style"] = "technical"
    elif hints["has_greeting"] or hints["has_signature"]:
        hints["language_style"] = "business"
    elif any(_contains_phrase(text_lower, cue) for cue in creative_cues):
        hints["language_style"] = "creative"

    return hints
def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers

    This is a format check only; it does not contact the provider.

    Args:
        api_key: The API key to validate
        provider: The provider name (openai, anthropic), case-insensitive

    Returns:
        True if the key format is valid, False otherwise (including for
        unknown providers and non-string arguments)
    """
    if not api_key or not isinstance(api_key, str):
        return False
    if not isinstance(provider, str):
        return False

    key = api_key.strip()
    # A key with whitespace in the middle was most likely mangled on paste.
    if any(char.isspace() for char in key):
        return False

    min_length = 40
    anthropic_prefix = "sk-ant-"
    provider_name = provider.strip().lower()

    if provider_name == "openai":
        # OpenAI keys start with 'sk-' and are typically 51 characters.
        # Anthropic keys share the 'sk-' prefix, so exclude them explicitly.
        return (
            key.startswith("sk-")
            and not key.startswith(anthropic_prefix)
            and len(key) >= min_length
        )
    if provider_name == "anthropic":
        # Anthropic keys start with 'sk-ant-'
        return key.startswith(anthropic_prefix) and len(key) >= min_length
    return False
def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries

    When truncation happens, "..." is appended and counted towards
    ``max_length``, so the returned string never exceeds ``max_length``
    characters.

    Args:
        text: Text to truncate (falsy values yield an empty string)
        max_length: Maximum allowed length of the result, ellipsis included.
            Values <= 0 yield an empty string; values too small to hold the
            ellipsis yield a hard cut without it.
        preserve_words: Whether to preserve word boundaries

    Returns:
        Truncated text
    """
    ellipsis = "..."

    if not text or max_length <= 0:
        return ""
    if len(text) <= max_length:
        return text
    if max_length <= len(ellipsis):
        # No room for any content plus the ellipsis; fall back to a hard cut.
        return text[:max_length]

    # Reserve space for the ellipsis so the result respects max_length.
    limit = max_length - len(ellipsis)
    clipped = text[:limit]

    if preserve_words:
        last_space = clipped.rfind(" ")
        # Only use word boundary if it's not too far back
        if last_space > limit * 0.8:
            clipped = clipped[:last_space]

    return clipped.rstrip() + ellipsis
def format_suggestions_for_display(
    suggestions: List[str], max_display_length: int = 100
) -> List[Dict[str, Any]]:
    """
    Format suggestions for display in the UI

    Args:
        suggestions: List of suggestion strings (``None`` is treated as an
            empty list)
        max_display_length: Maximum length for display

    Returns:
        List of formatted suggestion dictionaries, one per input suggestion
        and in the same order. Each has the keys ``id`` (1-based int),
        ``text`` (sanitized suggestion), ``display_text`` (sanitized and
        truncated), ``length`` (int) and ``word_count`` (int).
    """
    formatted: List[Dict[str, Any]] = []
    for position, suggestion in enumerate(suggestions or [], 1):
        # Clean the suggestion
        clean_suggestion = sanitize_input(suggestion)
        formatted.append(
            {
                "id": position,
                "text": clean_suggestion,
                # Display version (truncated if needed)
                "display_text": truncate_text(clean_suggestion, max_display_length),
                "length": len(clean_suggestion),
                "word_count": len(clean_suggestion.split()),
            }
        )
    return formatted
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap

    The score is the Jaccard index of the two lowercase,
    whitespace-separated word sets.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1 (0.0 if either text is empty or
        contains no words)
    """
    if not (text1 and text2):
        return 0.0

    # Case-insensitive sets of whitespace-delimited tokens
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    # Jaccard similarity: |A ∩ B| / |A ∪ B|
    combined = tokens_a | tokens_b
    if not combined:
        return 0.0
    shared = tokens_a & tokens_b
    return len(shared) / len(combined)
def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text

    Args:
        text: Text to analyze

    Returns:
        Dictionary with text statistics under the keys ``characters``
        (non-whitespace characters), ``words``, ``sentences`` (rough
        estimate from runs of ``.``, ``!``, ``?``) and ``paragraphs``
        (blocks separated by blank lines). Empty or whitespace-only text
        yields all zeros; any other text reports at least one sentence and
        one paragraph.
    """
    # Whitespace-only input carries no content, so treat it like empty text
    # instead of reporting a phantom sentence and paragraph.
    if not text or not text.strip():
        return {"characters": 0, "words": 0, "sentences": 0, "paragraphs": 0}

    words = text.split()

    # Count characters (excluding all whitespace, including "\r")
    char_count = sum(len(word) for word in words)

    # Count words
    word_count = len(words)

    # Count sentences (rough estimate)
    sentence_count = len(re.findall(r"[.!?]+", text))

    # Count paragraphs; a blank line may contain spaces or "\r"
    paragraph_count = len([p for p in re.split(r"\n\s*\n", text) if p.strip()])

    return {
        "characters": char_count,
        "words": word_count,
        "sentences": max(1, sentence_count),  # At least 1 sentence
        "paragraphs": max(1, paragraph_count),  # At least 1 paragraph
    }