Spaces:

criticalmaz
/

anycoder-545af39a

Runtime error

App Files Files Community

anycoder-545af39a / utils.py

criticalmaz

Update utils.py from anycoder

bcb6b46 verified 4 months ago

raw

history blame contribute delete

4.18 kB

	"""
	Utility functions for the Email Intelligence Platform
	"""

	import re
	import hashlib
	from typing import List, Dict, Any, Optional
	from datetime import datetime, timezone


	def clean_text(text: Optional[str]) -> str:
	    """Clean and normalize text for analysis.

	    Lowercases the input, drops every character that is not a word
	    character, whitespace, or one of ``. , ! ? -``, then collapses each run
	    of whitespace to a single space and trims both ends.

	    Args:
	        text: Raw text; ``None`` and the empty string are accepted.

	    Returns:
	        The normalized text, or ``""`` when ``text`` is falsy.
	    """
	    if not text:
	        return ""

	    # Convert to lowercase
	    text = text.lower()

	    # Remove special characters but keep basic punctuation
	    text = re.sub(r'[^\w\s.,!?-]', '', text)

	    # Collapse whitespace *after* removing characters: deleting a symbol that
	    # sat between two spaces ("a @ b") would otherwise leave a double space,
	    # or a stray leading/trailing space, in the result.
	    return re.sub(r'\s+', ' ', text).strip()


	def extract_email_address(sender_header: str) -> str:
	    """Extract the email address from a sender header.

	    Handles both the ``Name <email@domain.com>`` form and a bare address.

	    Args:
	        sender_header: Header value such as ``"Alice <alice@example.com>"``
	            or ``"alice@example.com"``.

	    Returns:
	        The lower-cased address with surrounding whitespace removed, or
	        ``""`` when the header is empty or holds nothing address-like.
	    """
	    if not sender_header:
	        return ""

	    # Try to extract email from format "Name <email@domain.com>"
	    bracketed = [
	        part.strip().lower()
	        for part in re.findall(r'<([^>]+)>', sender_header)
	    ]
	    if bracketed:
	        # A display name may itself contain angle brackets
	        # ('"<Bob>" <bob@x.com>'), so prefer the first group that contains
	        # an "@"; otherwise keep the first group as before.
	        return next((part for part in bracketed if '@' in part), bracketed[0])

	    # If no angle brackets, assume the whole string is an email
	    if '@' in sender_header:
	        return sender_header.strip().lower()

	    return ""


	def extract_domain(email_address: str) -> str:
	    """Extract the domain part of an email address.

	    Args:
	        email_address: Address such as ``"user@example.com"``. ``None`` and
	            the empty string are tolerated.

	    Returns:
	        The lower-cased text after the last ``@`` with surrounding
	        whitespace removed, or ``""`` when there is no ``@``.
	    """
	    # Guard falsy input so a missing address yields "" instead of raising.
	    if not email_address or '@' not in email_address:
	        return ""
	    # Split on the last "@" only; strip so padding or a trailing newline
	    # does not end up inside the domain.
	    return email_address.rsplit('@', 1)[-1].strip().lower()


	def generate_id(prefix: str, content: str) -> str:
	    """Build an identifier of the form ``<prefix>_<8 hex characters>``.

	    The hex part is the first eight characters of the SHA-256 digest of
	    ``content`` joined to the current UTC POSIX timestamp, so the result
	    depends on the call time as well as on the content.
	    """
	    now = datetime.now(timezone.utc).timestamp()
	    digest = hashlib.sha256(f"{content}_{now}".encode()).hexdigest()
	    return f"{prefix}_{digest[:8]}"


	def extract_keywords(text: str, min_length: int = 4, max_keywords: int = 10) -> List[str]:
	    """Return the most frequent keywords found in ``text``.

	    Words are taken from the lower-cased text. A word counts as a keyword
	    when it is purely alphabetic, at least ``min_length`` characters long
	    and not a stop word. At most ``max_keywords`` keywords are returned,
	    most frequent first; an empty ``text`` gives an empty list.
	    """
	    from collections import Counter

	    if not text:
	        return []

	    # Common words that carry no topical meaning.
	    ignored = {
	        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
	        'her', 'was', 'one', 'our', 'out', 'has', 'have', 'been', 'were', 'they',
	        'this', 'that', 'with', 'from', 'your', 'will', 'would', 'could', 'should',
	        'what', 'when', 'where', 'which', 'their', 'there', 'these', 'those'
	    }

	    def is_keyword(token):
	        return token.isalpha() and len(token) >= min_length and token not in ignored

	    tokens = re.findall(r'\b\w+\b', text.lower())
	    tally = Counter(filter(is_keyword, tokens))
	    return [token for token, _count in tally.most_common(max_keywords)]


	def format_timestamp(dt: Optional[datetime] = None) -> str:
	    """Return ``dt`` as an ISO 8601 string.

	    When ``dt`` is omitted, the current time in UTC is used.
	    """
	    moment = datetime.now(timezone.utc) if dt is None else dt
	    return moment.isoformat()


	def parse_json_safely(json_str: str, default: Any = None) -> Any:
	    """Parse a JSON document without raising on bad input.

	    Args:
	        json_str: The JSON text (``str``, ``bytes`` or ``bytearray``).
	        default: Value returned when parsing fails. When left as ``None``
	            an empty dict is returned instead.

	    Returns:
	        The decoded object, or the fallback value when ``json_str`` cannot
	        be parsed.
	    """
	    import json

	    try:
	        return json.loads(json_str)
	    except (ValueError, TypeError, RecursionError):
	        # ValueError covers json.JSONDecodeError and also the
	        # UnicodeDecodeError raised for undecodable bytes, which is not a
	        # JSONDecodeError. TypeError covers input that is not
	        # str/bytes/bytearray (e.g. None). RecursionError covers
	        # pathologically deep nesting.
	        return {} if default is None else default


	def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
	    """Truncate text to a maximum length.

	    Args:
	        text: The text to shorten. Falsy values are returned unchanged.
	        max_length: Maximum length of the result, suffix included.
	        suffix: Marker appended when the text is cut.

	    Returns:
	        ``text`` itself when it already fits; otherwise a prefix of ``text``
	        followed by ``suffix``, no longer than ``max_length`` in total. When
	        ``max_length`` is too small to hold the suffix, the text is hard-cut
	        to ``max_length`` characters without a suffix.
	    """
	    if not text or len(text) <= max_length:
	        return text

	    keep = max_length - len(suffix)
	    if keep < 0:
	        # A negative slice bound would count from the end of the string and
	        # give a result *longer* than max_length; hard-cut instead.
	        return text[:max(max_length, 0)]
	    return text[:keep] + suffix


	def calculate_confidence(scores: List[float]) -> float:
	    """Return the arithmetic mean of ``scores``.

	    An empty list yields ``0.0`` rather than a division by zero.
	    """
	    return sum(scores) / len(scores) if scores else 0.0


	def validate_email_format(email: str) -> bool:
	    """Validate the basic shape of an email address.

	    The check is purely syntactic: a local part made of letters, digits and
	    ``. _ % + -``, an ``@``, a domain made of letters, digits, dots and
	    hyphens, and a final label of at least two letters.

	    Args:
	        email: Candidate address. Non-string values are treated as invalid.

	    Returns:
	        ``True`` when the whole string matches the pattern, else ``False``.
	    """
	    if not isinstance(email, str):
	        return False
	    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
	    # fullmatch, not match with "$": "$" also matches just before a trailing
	    # newline, which would accept "user@example.com\n".
	    return re.fullmatch(pattern, email) is not None


	def sanitize_html(html_content: str) -> str:
	    """Remove potentially dangerous HTML constructs.

	    Strips ``<script>`` and ``<style>`` elements together with their
	    contents, and removes inline ``on*`` event-handler attributes.

	    NOTE: this is a best-effort regex filter, not a full HTML sanitizer. It
	    does not handle unterminated ``<script>``/``<style>`` tags or
	    ``javascript:`` URLs, and the handler pattern also matches
	    ``on...=value`` sequences that appear in plain text.

	    Args:
	        html_content: HTML markup; falsy input yields ``""``.

	    Returns:
	        The markup with the constructs above removed.
	    """
	    if not html_content:
	        return ""

	    block_flags = re.DOTALL | re.IGNORECASE

	    # Remove script tags (any attributes, contents may span lines)
	    html_content = re.sub(
	        r'<script\b[^>]*>.*?</script\s*>', '', html_content, flags=block_flags
	    )

	    # Remove style tags
	    html_content = re.sub(
	        r'<style\b[^>]*>.*?</style\s*>', '', html_content, flags=block_flags
	    )

	    # Remove event handlers. Each quoting style is matched separately so a
	    # value may contain the other quote character (onclick="f('x')"), and
	    # unquoted values (onerror=alert(1)) are covered too.
	    html_content = re.sub(
	        r'\s+on\w+\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+)',
	        '',
	        html_content,
	        flags=re.IGNORECASE,
	    )

	    return html_content