""" Utility functions for the Email Intelligence Platform """ import re import hashlib from typing import List, Dict, Any, Optional from datetime import datetime, timezone def clean_text(text: Optional[str]) -> str: """Clean and normalize text for analysis""" if not text: return "" # Convert to lowercase text = text.lower() # Remove extra whitespace text = re.sub(r'\s+', ' ', text.strip()) # Remove special characters but keep basic punctuation text = re.sub(r'[^\w\s.,!?-]', '', text) return text def extract_email_address(sender_header: str) -> str: """Extract email address from a sender header""" if not sender_header: return "" # Try to extract email from format "Name " match = re.search(r'<([^>]+)>', sender_header) if match: return match.group(1).lower() # If no angle brackets, assume the whole string is an email if '@' in sender_header: return sender_header.strip().lower() return "" def extract_domain(email_address: str) -> str: """Extract domain from email address""" if '@' in email_address: return email_address.split('@')[-1].lower() return "" def generate_id(prefix: str, content: str) -> str: """Generate a unique ID based on content hash""" timestamp = datetime.now(timezone.utc).timestamp() hash_input = f"{content}_{timestamp}" content_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:8] return f"{prefix}_{content_hash}" def extract_keywords(text: str, min_length: int = 4, max_keywords: int = 10) -> List[str]: """Extract keywords from text""" if not text: return [] # Common stop words to filter out stop_words = { 'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'has', 'have', 'been', 'were', 'they', 'this', 'that', 'with', 'from', 'your', 'will', 'would', 'could', 'should', 'what', 'when', 'where', 'which', 'their', 'there', 'these', 'those' } # Extract words words = re.findall(r'\b\w+\b', text.lower()) # Filter words keywords = [ word for word in words if len(word) >= min_length and word not in stop_words and word.isalpha() ] # Count frequency and return top keywords from collections import Counter word_freq = Counter(keywords) return [word for word, _ in word_freq.most_common(max_keywords)] def format_timestamp(dt: Optional[datetime] = None) -> str: """Format datetime to ISO string""" if dt is None: dt = datetime.now(timezone.utc) return dt.isoformat() def parse_json_safely(json_str: str, default: Any = None) -> Any: """Safely parse JSON string""" import json try: return json.loads(json_str) except (json.JSONDecodeError, TypeError): return default if default is not None else {} def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str: """Truncate text to maximum length""" if not text or len(text) <= max_length: return text return text[:max_length - len(suffix)] + suffix def calculate_confidence(scores: List[float]) -> float: """Calculate average confidence from a list of scores""" if not scores: return 0.0 return sum(scores) / len(scores) def validate_email_format(email: str) -> bool: """Validate email format""" pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' return bool(re.match(pattern, email)) def sanitize_html(html_content: str) -> str: """Remove potentially dangerous HTML tags""" if not html_content: return "" # Remove script tags html_content = re.sub(r']*>.*?', '', html_content, flags=re.DOTALL | re.IGNORECASE) # Remove style tags html_content = re.sub(r']*>.*?', '', html_content, flags=re.DOTALL | re.IGNORECASE) # Remove event handlers html_content = re.sub(r'\s+on\w+\s*=\s*["\'][^"\']*["\']', '', html_content, flags=re.IGNORECASE) return html_content