# Sandipan Haldar — adding submission (commit b309c22)
"""
Utility functions for Smart Auto-Complete
Provides common functionality for text processing, logging, and validation
"""
import html
import logging
import re
import sys
import unicodedata
from typing import Any, Dict, List, Optional, Tuple
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure and return the application-wide logger.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)

    Returns:
        A logger named "smart_autocomplete" with a single stdout handler
    """
    log_level = getattr(logging, level.upper())

    app_logger = logging.getLogger("smart_autocomplete")
    app_logger.setLevel(log_level)

    # Drop any handlers left from a previous call so messages are not duplicated
    while app_logger.handlers:
        app_logger.removeHandler(app_logger.handlers[0])

    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(log_level)
    stream_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
        )
    )
    app_logger.addHandler(stream_handler)

    return app_logger
def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing.

    Unicode normalization is applied BEFORE HTML escaping: NFKC folds
    compatibility characters such as fullwidth '＜' (U+FF1C) into ASCII
    '<', so escaping afterwards neutralizes those variants as well.
    (Escaping first would let them normalize into raw markup characters.)

    Args:
        text: Raw input text

    Returns:
        Cleaned and sanitized text
    """
    if not text:
        return ""

    # Convert to string if not already
    text = str(text)

    # Normalize unicode first so compatibility variants of HTML-significant
    # characters are caught by the escape step below
    text = unicodedata.normalize('NFKC', text)

    # HTML escape to prevent injection
    text = html.escape(text)

    # Remove excessive whitespace but preserve structure
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)  # Max 2 consecutive newlines
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs to single space

    # Remove control characters except newlines and tabs
    # NOTE(review): ord >= 32 keeps DEL (0x7F) and C1 controls — confirm
    # whether those should be stripped too
    text = ''.join(char for char in text if ord(char) >= 32 or char in '\n\t')

    # Trim leading/trailing whitespace
    return text.strip()
def extract_context_hints(text: str) -> Dict[str, Any]:
    """
    Extract contextual hints from the input text to improve suggestions.

    Fix: the annotation previously read ``Dict[str, any]`` — lowercase
    ``any`` is the builtin function, not a type; it is now ``Dict[str, Any]``.

    NOTE(review): all keyword checks use plain substring matching, so e.g.
    'hi' also matches inside 'this' — confirm whether word-boundary matching
    is wanted before tightening.

    Args:
        text: Input text to analyze

    Returns:
        Dictionary with keys: 'length', 'word_count', 'has_greeting',
        'has_signature', 'has_code_markers', 'has_questions',
        'tone' ('formal' | 'casual' | 'neutral'),
        'language_style' ('technical' | 'business' | 'creative' | 'general')
    """
    hints = {
        'length': len(text),
        'word_count': len(text.split()),
        'has_greeting': False,
        'has_signature': False,
        'has_code_markers': False,
        'has_questions': False,
        'tone': 'neutral',
        'language_style': 'general'
    }
    text_lower = text.lower()

    # Check for email patterns (openings and closings)
    email_greetings = ['dear', 'hello', 'hi', 'greetings', 'good morning', 'good afternoon']
    email_signatures = ['sincerely', 'best regards', 'thank you', 'yours truly', 'kind regards']
    hints['has_greeting'] = any(greeting in text_lower for greeting in email_greetings)
    hints['has_signature'] = any(signature in text_lower for signature in email_signatures)

    # Check for code patterns (comment tokens and common keywords)
    code_markers = ['//', '/*', '*/', '#', 'def ', 'function', 'class ', 'import ', 'from ']
    hints['has_code_markers'] = any(marker in text_lower for marker in code_markers)

    # Check for questions (punctuation or interrogative words)
    hints['has_questions'] = '?' in text or any(q in text_lower for q in ['what', 'how', 'why', 'when', 'where', 'who'])

    # Determine tone by comparing counts of formal vs. casual vocabulary
    formal_words = ['please', 'kindly', 'respectfully', 'sincerely', 'professional']
    casual_words = ['hey', 'yeah', 'cool', 'awesome', 'thanks']
    formal_count = sum(1 for word in formal_words if word in text_lower)
    casual_count = sum(1 for word in casual_words if word in text_lower)
    if formal_count > casual_count:
        hints['tone'] = 'formal'
    elif casual_count > formal_count:
        hints['tone'] = 'casual'

    # Determine language style; code markers take precedence over email cues
    if hints['has_code_markers']:
        hints['language_style'] = 'technical'
    elif hints['has_greeting'] or hints['has_signature']:
        hints['language_style'] = 'business'
    elif any(creative in text_lower for creative in ['once upon', 'story', 'character', 'plot']):
        hints['language_style'] = 'creative'

    return hints
def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Check that an API key matches the expected format for a provider.

    Args:
        api_key: The API key to validate
        provider: The provider name (openai, anthropic)

    Returns:
        True if the key format is valid, False otherwise
    """
    if not isinstance(api_key, str) or not api_key:
        return False

    key = api_key.strip()
    provider_name = provider.lower()

    # Each supported provider has a fixed key prefix; both use keys of
    # at least 40 characters.
    if provider_name == 'openai':
        # OpenAI keys start with 'sk-' and are typically 51 characters
        prefix = 'sk-'
    elif provider_name == 'anthropic':
        # Anthropic keys start with 'sk-ant-'
        prefix = 'sk-ant-'
    else:
        return False

    return key.startswith(prefix) and len(key) >= 40
def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Shorten text to at most ``max_length`` characters, appending "..." when cut.

    Args:
        text: Text to truncate
        max_length: Maximum allowed length before truncation kicks in
        preserve_words: Whether to cut at the last word boundary, provided
            that boundary lies within the final 20% of the allowed length

    Returns:
        The original text if short enough, otherwise a truncated version
        ending in "..."
    """
    if len(text) <= max_length:
        return text

    head = text[:max_length]
    if preserve_words:
        boundary = head.rfind(' ')
        # Only cut at the word boundary when it doesn't discard too much text
        if boundary > max_length * 0.8:
            return text[:boundary].rstrip() + "..."

    return head.rstrip() + "..."
def format_suggestions_for_display(suggestions: List[str], max_display_length: int = 100) -> List[Dict[str, object]]:
    """
    Format suggestions for display in the UI.

    Fix: the return annotation previously claimed ``List[Dict[str, str]]``,
    but the dictionaries mix str and int values ('id', 'length',
    'word_count' are ints), so the element type is ``Dict[str, object]``.

    Args:
        suggestions: List of suggestion strings
        max_display_length: Maximum length for the truncated display text

    Returns:
        List of dictionaries with keys 'id' (int, 1-based), 'text' (str,
        sanitized), 'display_text' (str, truncated), 'length' (int) and
        'word_count' (int)
    """
    formatted = []
    for i, suggestion in enumerate(suggestions, 1):
        # Clean the suggestion before measuring or truncating it
        clean_suggestion = sanitize_input(suggestion)
        # Create display version (truncated if needed)
        display_text = truncate_text(clean_suggestion, max_display_length)
        formatted.append({
            'id': i,
            'text': clean_suggestion,
            'display_text': display_text,
            'length': len(clean_suggestion),
            'word_count': len(clean_suggestion.split())
        })
    return formatted
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Compute the Jaccard similarity of the word sets of two texts.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1 (1.0 means identical word sets)
    """
    if not text1 or not text2:
        return 0.0

    # Case-insensitive bag-of-words comparison
    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    # Jaccard index: |intersection| / |union|
    shared = vocab_a & vocab_b
    combined = vocab_a | vocab_b
    if not combined:
        return 0.0
    return len(shared) / len(combined)
def get_text_stats(text: str) -> Dict[str, int]:
    """
    Compute basic statistics for a piece of text.

    Args:
        text: Text to analyze

    Returns:
        Dictionary with 'characters' (excluding spaces, newlines and tabs),
        'words', 'sentences' and 'paragraphs'. Non-empty text always reports
        at least one sentence and one paragraph.
    """
    if not text:
        return {'characters': 0, 'words': 0, 'sentences': 0, 'paragraphs': 0}

    # Characters, not counting spaces, newlines or tabs
    condensed = text.replace(' ', '').replace('\n', '').replace('\t', '')

    # Sentences are approximated by runs of terminal punctuation
    sentence_runs = re.findall(r'[.!?]+', text)

    # Paragraphs are blank-line-separated chunks with visible content
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]

    return {
        'characters': len(condensed),
        'words': len(text.split()),
        'sentences': max(1, len(sentence_runs)),   # At least 1 sentence
        'paragraphs': max(1, len(paragraphs)),     # At least 1 paragraph
    }