|
|
""" |
|
|
Utility functions for Smart Auto-Complete.

Provides common functionality for text processing, logging, and validation.
|
|
""" |
|
|
|
|
|
import logging |
|
|
import re |
|
|
import sys |
|
|
from typing import Dict, List, Optional, Tuple |
|
|
import html |
|
|
import unicodedata |
|
|
|
|
|
|
|
|
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Set up logging configuration for the application

    Configures the "smart_autocomplete" logger with a single stdout handler.
    Handlers already attached to that logger are removed first, so calling
    this function repeatedly does not duplicate output.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. A numeric level (e.g. logging.DEBUG) is also
            accepted. An unrecognised value falls back to INFO and a warning
            is logged instead of raising AttributeError.

    Returns:
        Configured logger instance
    """
    if isinstance(level, int) and not isinstance(level, bool):
        numeric_level = level
    else:
        # Resolve the name to its numeric value. The isinstance check rejects
        # attributes of the logging module that are not levels (for example
        # BASIC_FORMAT, which is a string).
        numeric_level = getattr(logging, str(level).upper(), None)

    level_is_valid = isinstance(numeric_level, int) and not isinstance(numeric_level, bool)
    if not level_is_valid:
        numeric_level = logging.INFO

    logger = logging.getLogger("smart_autocomplete")
    logger.setLevel(numeric_level)

    # Iterate over a copy: removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(numeric_level)
    console_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
        )
    )
    logger.addHandler(console_handler)

    # Report the fallback only once the handler exists, so the message is
    # actually emitted through the configured logger.
    if not level_is_valid:
        logger.warning("Unknown logging level %r; falling back to INFO", level)

    return logger
|
|
|
|
|
|
|
|
def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing

    Steps, in order: Unicode NFKC normalization, removal of control
    characters (code points below 32, except newline and tab), HTML
    escaping, whitespace collapsing, and trimming.

    Args:
        text: Raw input text. Falsy values (None, "", 0) yield "".

    Returns:
        Cleaned and sanitized text
    """
    if not text:
        return ""

    text = str(text)

    # Normalize BEFORE escaping: NFKC folds compatibility characters such as
    # the fullwidth less-than sign (U+FF1C) into ASCII '<'. Escaping first
    # would let those characters come out as unescaped markup.
    text = unicodedata.normalize('NFKC', text)

    # Drop control characters before collapsing whitespace, so removing one
    # from between two spaces does not leave a double space behind.
    text = ''.join(char for char in text if ord(char) >= 32 or char in '\n\t')

    text = html.escape(text)

    # Collapse three or more line breaks (possibly separated by whitespace)
    # into one blank line, then squeeze runs of spaces/tabs to a single space.
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    return text.strip()
|
|
|
|
|
|
|
|
def _mentions_any(text: str, phrases: List[str], prefix_only: bool = False) -> bool:
    """Return True if any phrase occurs in text on a word boundary.

    Each phrase must start on a word boundary; unless prefix_only is set it
    must also end on one, so 'hi' does not match inside 'this' or 'hire'.
    """
    alternatives = '|'.join(re.escape(phrase) for phrase in phrases)
    closing = '' if prefix_only else r'\b'
    return re.search(r'\b(?:' + alternatives + ')' + closing, text) is not None


def extract_context_hints(text: str) -> Dict[str, object]:
    """
    Extract contextual hints from the input text to improve suggestions

    Greeting, signature, question and tone keywords are matched as whole
    words (case-insensitively), so short keywords such as 'hi', 'who' or
    'hey' no longer fire inside unrelated words ('this', 'whole', 'they').
    Code markers are still matched as plain substrings because most of them
    are punctuation.

    Args:
        text: Input text to analyze. None is treated as an empty string.

    Returns:
        Dictionary containing context hints with the keys 'length',
        'word_count', 'has_greeting', 'has_signature', 'has_code_markers',
        'has_questions', 'tone' ('neutral', 'formal' or 'casual') and
        'language_style' ('general', 'technical', 'business' or 'creative')
    """
    if text is None:
        text = ""

    hints = {
        'length': len(text),
        'word_count': len(text.split()),
        'has_greeting': False,
        'has_signature': False,
        'has_code_markers': False,
        'has_questions': False,
        'tone': 'neutral',
        'language_style': 'general',
    }

    text_lower = text.lower()

    email_greetings = ['dear', 'hello', 'hi', 'greetings', 'good morning', 'good afternoon']
    email_signatures = ['sincerely', 'best regards', 'thank you', 'yours truly', 'kind regards']
    hints['has_greeting'] = _mentions_any(text_lower, email_greetings)
    hints['has_signature'] = _mentions_any(text_lower, email_signatures)

    code_markers = ['//', '/*', '*/', '#', 'def ', 'function', 'class ', 'import ', 'from ']
    hints['has_code_markers'] = any(marker in text_lower for marker in code_markers)

    question_words = ['what', 'how', 'why', 'when', 'where', 'who']
    hints['has_questions'] = '?' in text or _mentions_any(text_lower, question_words)

    formal_words = ['please', 'kindly', 'respectfully', 'sincerely', 'professional']
    casual_words = ['hey', 'yeah', 'cool', 'awesome', 'thanks']

    # Count distinct keywords present, not total occurrences.
    formal_count = sum(1 for word in formal_words if _mentions_any(text_lower, [word]))
    casual_count = sum(1 for word in casual_words if _mentions_any(text_lower, [word]))

    if formal_count > casual_count:
        hints['tone'] = 'formal'
    elif casual_count > formal_count:
        hints['tone'] = 'casual'

    # Priority order: technical beats business beats creative.
    if hints['has_code_markers']:
        hints['language_style'] = 'technical'
    elif hints['has_greeting'] or hints['has_signature']:
        hints['language_style'] = 'business'
    elif _mentions_any(text_lower, ['once upon', 'story', 'character', 'plot'], prefix_only=True):
        # Prefix matching keeps inflections ('characters', 'plots') while
        # rejecting embedded hits such as 'story' inside 'history'.
        hints['language_style'] = 'creative'

    return hints
|
|
|
|
|
|
|
|
def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers

    This is a format check only (prefix and minimum length); it does not
    contact the provider.

    Args:
        api_key: The API key to validate. Surrounding whitespace is ignored.
        provider: The provider name (openai, anthropic), case-insensitive.
            Surrounding whitespace is ignored.

    Returns:
        True if the key format is valid, False otherwise. Missing or
        non-string arguments and unknown providers yield False.
    """
    if not api_key or not isinstance(api_key, str):
        return False

    # A missing or non-string provider cannot match any known format;
    # report that as invalid instead of raising AttributeError.
    if not isinstance(provider, str):
        return False

    api_key = api_key.strip()
    provider_name = provider.strip().lower()

    if provider_name == 'openai':
        # 'sk-ant-' also starts with 'sk-', so reject it explicitly:
        # otherwise an Anthropic key passes as an OpenAI key.
        return (
            api_key.startswith('sk-')
            and not api_key.startswith('sk-ant-')
            and len(api_key) >= 40
        )

    if provider_name == 'anthropic':
        return api_key.startswith('sk-ant-') and len(api_key) >= 40

    return False
|
|
|
|
|
|
|
|
def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries

    The returned string, including the trailing "..." marker, is never longer
    than max_length.

    Args:
        text: Text to truncate
        max_length: Maximum allowed length of the result. Values of zero or
            below yield an empty string; values too small to hold the "..."
            marker yield a plain prefix of the text.
        preserve_words: Whether to preserve word boundaries. The cut is moved
            back to the last space only when that space lies in the final
            20% of the available room, so little text is lost.

    Returns:
        Truncated text
    """
    if len(text) <= max_length:
        return text

    # A negative slice bound would count from the end of the string.
    if max_length <= 0:
        return ""

    ellipsis = "..."
    if max_length <= len(ellipsis):
        # No room for the marker plus any content: return a bare prefix.
        return text[:max_length]

    # Reserve room for the marker so the total stays within max_length.
    budget = max_length - len(ellipsis)
    truncated = text[:budget]

    # If the character right after the cut is whitespace, the cut already
    # sits on a word boundary and nothing needs to be backed off.
    if preserve_words and not text[budget].isspace():
        last_space = truncated.rfind(' ')
        if last_space > budget * 0.8:
            truncated = truncated[:last_space]

    return truncated.rstrip() + ellipsis
|
|
|
|
|
|
|
|
def format_suggestions_for_display(suggestions: List[str], max_display_length: int = 100) -> List[Dict[str, object]]:
    """
    Format suggestions for display in the UI

    Args:
        suggestions: List of suggestion strings. None is treated as an
            empty list.
        max_display_length: Maximum length for display, passed to
            truncate_text

    Returns:
        List of formatted suggestion dictionaries, one per input suggestion
        in the original order. Each has the keys 'id' (1-based position,
        int), 'text' (sanitized suggestion), 'display_text' (sanitized and
        truncated), 'length' (int) and 'word_count' (int).
    """
    formatted = []

    for position, suggestion in enumerate(suggestions or [], 1):
        # Sanitize first so the truncation and the reported statistics
        # describe the text that is actually shown.
        clean_suggestion = sanitize_input(suggestion)
        display_text = truncate_text(clean_suggestion, max_display_length)

        formatted.append({
            'id': position,
            'text': clean_suggestion,
            'display_text': display_text,
            'length': len(clean_suggestion),
            'word_count': len(clean_suggestion.split()),
        })

    return formatted
|
|
|
|
|
|
|
|
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap

    The score is the Jaccard index of the two word sets: shared words divided
    by all distinct words. Comparison is case-insensitive and ignores
    punctuation attached to words, so "Hello, world!" and "hello world" are
    treated as identical. Apostrophes inside a word ("don't") are kept.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1. Returns 0.0 when either text is
        empty or when neither text contains any word.
    """
    if not text1 or not text2:
        return 0.0

    # Extract words rather than splitting on whitespace, so trailing commas,
    # periods and so on do not make otherwise equal words differ.
    word_pattern = r"\w+(?:'\w+)*"
    words1 = set(re.findall(word_pattern, text1.lower()))
    words2 = set(re.findall(word_pattern, text2.lower()))

    union_size = len(words1 | words2)
    if union_size == 0:
        return 0.0

    return len(words1 & words2) / union_size
|
|
|
|
|
|
|
|
def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text

    Args:
        text: Text to analyze

    Returns:
        Dictionary with text statistics:
            'characters': number of non-whitespace characters
            'words': number of whitespace-separated words
            'sentences': number of runs of '.', '!' or '?' (at least 1)
            'paragraphs': number of non-empty blocks separated by blank
                lines (at least 1)
        Empty or whitespace-only text yields zero for every statistic.
    """
    # Whitespace-only text has no content; report it like empty text rather
    # than as one sentence in one paragraph.
    if not text or not text.strip():
        return {'characters': 0, 'words': 0, 'sentences': 0, 'paragraphs': 0}

    words = text.split()

    # Summing word lengths excludes every kind of whitespace, including
    # carriage returns and non-breaking spaces.
    char_count = sum(len(word) for word in words)

    sentence_count = len(re.findall(r'[.!?]+', text))

    # A paragraph break is a blank line; the line may hold stray whitespace
    # (including the '\r' of Windows line endings).
    paragraph_count = sum(1 for block in re.split(r'\n\s*\n', text) if block.strip())

    return {
        'characters': char_count,
        'words': len(words),
        # Non-empty text counts as at least one sentence and one paragraph
        # even when it has no terminating punctuation.
        'sentences': max(1, sentence_count),
        'paragraphs': max(1, paragraph_count),
    }
|
|
|