|
|
""" |
|
|
Utility functions for Smart Auto-Complete |
|
|
Provides common functionality for text processing, logging, and validation |
|
|
""" |
|
|
|
|
|
import html |
|
|
import logging |
|
|
import re |
|
|
import sys |
|
|
import unicodedata |
|
|
from typing import Dict, List, Optional, Tuple |
|
|
|
|
|
|
|
|
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Set up logging configuration for the application.

    Configures the "smart_autocomplete" logger with a single stdout
    handler. Any handlers already attached to that logger are removed
    first, so calling this function repeatedly does not duplicate output.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. A numeric level is accepted as well. An
            unrecognised value falls back to INFO and a warning is logged.

    Returns:
        Configured logger instance
    """
    # Resolve the level once and make sure it really is a numeric level;
    # a bare getattr() would raise an opaque AttributeError on a typo.
    if isinstance(level, int) and not isinstance(level, bool):
        numeric_level = level
        level_is_valid = True
    else:
        candidate = getattr(logging, str(level).strip().upper(), None)
        level_is_valid = isinstance(candidate, int) and not isinstance(
            candidate, bool
        )
        numeric_level = candidate if level_is_valid else logging.INFO

    logger = logging.getLogger("smart_autocomplete")
    logger.setLevel(numeric_level)

    # Iterate over a copy: removeHandler mutates logger.handlers.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(numeric_level)
    console_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(console_handler)

    if not level_is_valid:
        logger.warning("Unknown logging level %r; falling back to INFO", level)

    return logger
|
|
|
|
|
|
|
|
def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing.

    Steps, in order:
      1. Unicode NFKC normalization.
      2. Removal of control characters (code points below 32) other than
         newline and tab.
      3. Collapsing of three or more consecutive (possibly
         whitespace-separated) newlines into one blank line, and of runs
         of spaces/tabs into a single space.
      4. Stripping of leading/trailing whitespace.
      5. HTML escaping.

    HTML escaping is deliberately the last step: NFKC folds compatibility
    characters such as full-width "＜" into plain "<", so escaping before
    normalizing would let those characters through unescaped.

    Args:
        text: Raw input text

    Returns:
        Cleaned and sanitized text ("" for empty or falsy input)
    """
    if not text:
        return ""

    cleaned = unicodedata.normalize("NFKC", str(text))

    # Drop control characters before collapsing whitespace so that removing
    # one between two spaces cannot leave a double space behind.
    cleaned = "".join(ch for ch in cleaned if ord(ch) >= 32 or ch in "\n\t")

    cleaned = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = cleaned.strip()

    return html.escape(cleaned)
|
|
|
|
|
|
|
|
def _has_phrase(haystack: str, phrase: str) -> bool:
    """Return True if *phrase* occurs in *haystack* as whole word(s)."""
    return re.search(r"\b" + re.escape(phrase) + r"\b", haystack) is not None


def extract_context_hints(text: str) -> Dict[str, object]:
    """
    Extract contextual hints from the input text to improve suggestions.

    Natural-language cues (greetings, signatures, question words, tone
    words) are matched on word boundaries, so "hi" does not fire inside
    "this" and "how" does not fire inside "show". Code markers and
    creative-writing cues are matched as plain substrings.

    Args:
        text: Input text to analyze (None is treated as empty text)

    Returns:
        Dictionary containing context hints:
            length (int), word_count (int), has_greeting (bool),
            has_signature (bool), has_code_markers (bool),
            has_questions (bool), tone ("neutral" | "formal" | "casual"),
            language_style ("linkedin" | "technical" | "business" |
            "creative")
    """
    text = "" if text is None else str(text)
    text_lower = text.lower()

    hints: Dict[str, object] = {
        "length": len(text),
        "word_count": len(text.split()),
        "has_greeting": False,
        "has_signature": False,
        "has_code_markers": False,
        "has_questions": False,
        "tone": "neutral",
        "language_style": "linkedin",
    }

    email_greetings = [
        "dear",
        "hello",
        "hi",
        "greetings",
        "good morning",
        "good afternoon",
    ]
    email_signatures = [
        "sincerely",
        "best regards",
        "thank you",
        "yours truly",
        "kind regards",
    ]
    hints["has_greeting"] = any(
        _has_phrase(text_lower, greeting) for greeting in email_greetings
    )
    hints["has_signature"] = any(
        _has_phrase(text_lower, signature) for signature in email_signatures
    )

    # Code markers are mostly symbols or carry their own trailing space,
    # so they stay plain substring checks.
    code_markers = [
        "//",
        "/*",
        "*/",
        "#",
        "def ",
        "function",
        "class ",
        "import ",
        "from ",
    ]
    hints["has_code_markers"] = any(marker in text_lower for marker in code_markers)

    question_words = ["what", "how", "why", "when", "where", "who"]
    hints["has_questions"] = "?" in text or any(
        _has_phrase(text_lower, word) for word in question_words
    )

    formal_words = ["please", "kindly", "respectfully", "sincerely", "professional"]
    casual_words = ["hey", "yeah", "cool", "awesome", "thanks"]
    formal_count = sum(1 for word in formal_words if _has_phrase(text_lower, word))
    casual_count = sum(1 for word in casual_words if _has_phrase(text_lower, word))

    if formal_count > casual_count:
        hints["tone"] = "formal"
    elif casual_count > formal_count:
        hints["tone"] = "casual"

    # Style precedence: technical > business > creative > default.
    if hints["has_code_markers"]:
        hints["language_style"] = "technical"
    elif hints["has_greeting"] or hints["has_signature"]:
        hints["language_style"] = "business"
    elif any(
        # Substring match on purpose so inflected forms ("characters",
        # "plots") still count.
        creative in text_lower
        for creative in ["once upon", "story", "character", "plot"]
    ):
        hints["language_style"] = "creative"

    return hints
|
|
|
|
|
|
|
|
def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers.

    This is a format check only (prefix and minimum length); it does not
    contact the provider.

    Args:
        api_key: The API key to validate
        provider: The provider name (openai, anthropic), case-insensitive

    Returns:
        True if the key format is valid, False otherwise. Missing or
        non-string arguments and unknown providers yield False.
    """
    if not api_key or not isinstance(api_key, str):
        return False
    # A missing/non-string provider is simply "not valid" rather than a crash.
    if not provider or not isinstance(provider, str):
        return False

    key = api_key.strip()
    provider_name = provider.strip().lower()
    long_enough = len(key) >= 40

    if provider_name == "openai":
        # "sk-ant-" is the Anthropic prefix; such a key also starts with
        # "sk-" but must not be accepted as an OpenAI key.
        return key.startswith("sk-") and not key.startswith("sk-ant-") and long_enough
    if provider_name == "anthropic":
        return key.startswith("sk-ant-") and long_enough

    return False
|
|
|
|
|
|
|
|
def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries.

    The returned string, including the trailing "..." marker, is never
    longer than max_length.

    Args:
        text: Text to truncate
        max_length: Maximum allowed length of the result. Values <= 0
            yield an empty string.
        preserve_words: Whether to preserve word boundaries. When True the
            cut is moved back to the last space, provided that keeps more
            than 80% of the available room; otherwise the text is cut
            mid-word.

    Returns:
        Truncated text (the original text if it already fits)
    """
    ellipsis = "..."

    if max_length <= 0:
        return ""
    if len(text) <= max_length:
        return text
    if max_length <= len(ellipsis):
        # No room for any content plus the marker; hard-cut instead.
        return text[:max_length]

    # Reserve room for the marker so the result respects max_length.
    budget = max_length - len(ellipsis)
    head = text[:budget]

    # If the character right after the cut is whitespace, the cut already
    # falls on a word boundary and nothing needs to be given up.
    if preserve_words and not text[budget].isspace():
        last_space = head.rfind(" ")
        if last_space > budget * 0.8:
            head = head[:last_space]

    return head.rstrip() + ellipsis
|
|
|
|
|
|
|
|
def format_suggestions_for_display(
    suggestions: List[str], max_display_length: int = 100
) -> List[Dict[str, object]]:
    """
    Format suggestions for display in the UI.

    Each suggestion is cleaned with sanitize_input and a shortened copy is
    produced with truncate_text.

    Args:
        suggestions: List of suggestion strings (None is treated as empty)
        max_display_length: Maximum length passed to truncate_text for the
            display copy

    Returns:
        List of formatted suggestion dictionaries, one per input item and
        in input order, with keys:
            id (int, 1-based position), text (str, sanitized),
            display_text (str, sanitized and truncated),
            length (int, characters in text),
            word_count (int, whitespace-separated words in text)
    """
    formatted: List[Dict[str, object]] = []

    for position, suggestion in enumerate(suggestions or [], 1):
        clean_suggestion = sanitize_input(suggestion)
        formatted.append(
            {
                "id": position,
                "text": clean_suggestion,
                "display_text": truncate_text(clean_suggestion, max_display_length),
                "length": len(clean_suggestion),
                "word_count": len(clean_suggestion.split()),
            }
        )

    return formatted
|
|
|
|
|
|
|
|
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap.

    The score is the Jaccard index of the two texts' word sets:
    |shared words| / |all distinct words|. Comparison is case-insensitive
    and ignores punctuation attached to the start or end of a word, so
    "Hello, world!" and "hello world" are considered identical.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1 (0.0 if either text is empty or
        contains no words)
    """
    if not text1 or not text2:
        return 0.0

    def _word_set(text: str) -> set:
        # Trim non-word characters from both ends of each token; tokens
        # that were punctuation only become empty and are discarded.
        trimmed = (re.sub(r"^\W+|\W+$", "", token) for token in text.lower().split())
        return {word for word in trimmed if word}

    words1 = _word_set(text1)
    words2 = _word_set(text2)

    union = len(words1 | words2)
    if union == 0:
        return 0.0
    return len(words1 & words2) / union
|
|
|
|
|
|
|
|
def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text.

    Args:
        text: Text to analyze

    Returns:
        Dictionary with text statistics:
            characters: number of non-whitespace characters
            words: number of whitespace-separated words
            sentences: number of runs of ".", "!" or "?" (at least 1 for
                non-blank text)
            paragraphs: number of non-blank chunks separated by blank
                lines (at least 1 for non-blank text)
        All four values are 0 for empty or whitespace-only text.
    """
    # Whitespace-only text has no content; report it like empty text
    # instead of claiming one sentence and one paragraph.
    if not text or not text.strip():
        return {"characters": 0, "words": 0, "sentences": 0, "paragraphs": 0}

    words = text.split()

    # str.split() splits on every kind of whitespace (including "\r"), so
    # the summed word lengths are exactly the non-whitespace characters.
    char_count = sum(len(word) for word in words)

    sentence_count = len(re.findall(r"[.!?]+", text))

    # A paragraph break is a blank line; allow stray spaces or "\r" in it.
    paragraphs = [chunk for chunk in re.split(r"\n\s*\n", text) if chunk.strip()]

    return {
        "characters": char_count,
        "words": len(words),
        "sentences": max(1, sentence_count),
        "paragraphs": max(1, len(paragraphs)),
    }
|
|
|