criticalmaz committed on
Commit
bcb6b46
·
verified ·
1 Parent(s): 38aece4

Update utils.py from anycoder

Browse files
Files changed (1) hide show
  1. utils.py +138 -0
utils.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for the Email Intelligence Platform
3
+ """
4
+
5
+ import re
6
+ import hashlib
7
+ from typing import List, Dict, Any, Optional
8
+ from datetime import datetime, timezone
9
+
10
+
11
def clean_text(text: Optional[str]) -> str:
    """Clean and normalize text for analysis.

    The text is lowercased, stripped of every character that is not a word
    character, whitespace, or one of ``. , ! ? -``, and finally has runs of
    whitespace collapsed to a single space with both ends trimmed.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The normalized text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)

    # Collapse whitespace last: deleting a symbol that stood between two
    # spaces (or at either end of the text) would otherwise leave a doubled
    # or leading/trailing space behind.
    return re.sub(r'\s+', ' ', text).strip()
26
+
27
+
28
def extract_email_address(sender_header: str) -> str:
    """Extract the email address from a sender header.

    Two shapes are recognized: ``"Name <email@domain.com>"`` (the text inside
    the first pair of angle brackets is used) and a bare string containing
    ``@`` (the whole string is used).

    Args:
        sender_header: Raw header value; may be empty or ``None``.

    Returns:
        The address, lowercased and with surrounding whitespace removed, or
        ``""`` when no address can be found.
    """
    if not sender_header:
        return ""

    # Try to extract email from format "Name <email@domain.com>"
    match = re.search(r'<([^>]+)>', sender_header)
    if match:
        # Strip here too so both branches normalize the same way
        # (e.g. "< a@b.com >" must not keep its padding).
        return match.group(1).strip().lower()

    # If no angle brackets, assume the whole string is an email
    if '@' in sender_header:
        return sender_header.strip().lower()

    return ""
43
+
44
+
45
def extract_domain(email_address: str) -> str:
    """Extract the domain from an email address.

    Args:
        email_address: Address to inspect; empty or ``None`` is tolerated.

    Returns:
        The text after the last ``@``, lowercased and stripped of surrounding
        whitespace, or ``""`` when the input is falsy or contains no ``@``.
    """
    # Guard against None/empty so this helper fails soft like its siblings
    # instead of raising TypeError on the membership test.
    if not email_address or '@' not in email_address:
        return ""
    return email_address.rpartition('@')[2].strip().lower()
50
+
51
+
52
def generate_id(prefix: str, content: str) -> str:
    """Build an identifier of the form ``"<prefix>_<8 hex chars>"``.

    The hex part is the first eight characters of the SHA-256 digest of the
    content joined with the current UTC epoch timestamp, so repeated calls
    with the same content normally yield different IDs.
    """
    now_epoch = datetime.now(timezone.utc).timestamp()
    digest = hashlib.sha256(f"{content}_{now_epoch}".encode()).hexdigest()
    return f"{prefix}_{digest[:8]}"
58
+
59
+
60
def extract_keywords(text: str, min_length: int = 4, max_keywords: int = 10) -> List[str]:
    """Return the most frequent keywords found in ``text``.

    Words are taken from the lowercased text; a word counts as a keyword when
    it is purely alphabetic, at least ``min_length`` characters long, and not
    a common stop word. The result holds at most ``max_keywords`` entries,
    most frequent first.
    """
    if not text:
        return []

    from collections import Counter

    # Frequent function words that carry no topical meaning.
    ignored = {
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'has', 'have', 'been', 'were', 'they',
        'this', 'that', 'with', 'from', 'your', 'will', 'would', 'could', 'should',
        'what', 'when', 'where', 'which', 'their', 'there', 'these', 'those',
    }

    def is_keyword(token: str) -> bool:
        return len(token) >= min_length and token not in ignored and token.isalpha()

    tokens = re.findall(r'\b\w+\b', text.lower())
    counts = Counter(filter(is_keyword, tokens))
    ranked = counts.most_common(max_keywords)
    return [token for token, _ in ranked]
86
+
87
+
88
def format_timestamp(dt: Optional[datetime] = None) -> str:
    """Return ``dt`` as an ISO 8601 string.

    When ``dt`` is omitted (or ``None``), the current time in UTC is used.
    """
    moment = datetime.now(timezone.utc) if dt is None else dt
    return moment.isoformat()
93
+
94
+
95
def parse_json_safely(json_str: str, default: Any = None) -> Any:
    """Parse a JSON document without raising on malformed input.

    Args:
        json_str: The JSON text (``str``, ``bytes`` or ``bytearray``).
        default: Value returned when parsing fails. ``None`` (the default)
            is treated as "no default given" and yields a fresh empty dict.

    Returns:
        The decoded object, or the fallback value when ``json_str`` is not
        valid JSON, is not decodable text, or is of an unsupported type.
    """
    import json

    try:
        return json.loads(json_str)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError *and* the UnicodeDecodeError
        # raised for undecodable bytes input; TypeError covers None/objects.
        return default if default is not None else {}
102
+
103
+
104
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """Truncate text so the result is at most ``max_length`` characters.

    Args:
        text: Text to shorten; falsy values are returned unchanged.
        max_length: Maximum length of the returned string.
        suffix: Marker appended to shortened text; it counts toward
            ``max_length``.

    Returns:
        ``text`` itself when it already fits; otherwise the leading part of
        ``text`` followed by ``suffix``. When ``max_length`` is too small to
        hold the suffix, the text is hard-cut to ``max_length`` characters
        with no suffix.
    """
    if not text or len(text) <= max_length:
        return text

    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would count from the end of the string and
        # could return something longer than the input; hard-cut instead.
        return text[:max(max_length, 0)]
    return text[:keep] + suffix
109
+
110
+
111
def calculate_confidence(scores: List[float]) -> float:
    """Return the arithmetic mean of ``scores``, or ``0.0`` when it is empty."""
    if scores:
        return sum(scores) / len(scores)
    return 0.0
116
+
117
+
118
def validate_email_format(email: str) -> bool:
    """Check whether ``email`` looks like ``local@domain.tld``.

    This is a pragmatic format check, not a full RFC 5322 validation.

    Args:
        email: Candidate address.

    Returns:
        ``True`` when the whole string matches the pattern; ``False``
        otherwise, including for non-string input.
    """
    if not isinstance(email, str):
        return False

    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.fullmatch(pattern, email) is not None
122
+
123
+
124
def sanitize_html(html_content: str) -> str:
    """Remove potentially dangerous constructs from an HTML string.

    Strips ``<script>`` and ``<style>`` elements (including their content)
    and inline ``on*=`` event-handler attributes.

    NOTE: this is a best-effort, regex-based filter. It does not parse HTML
    and is not a substitute for a real sanitizer when the output is rendered
    in a browser from untrusted input (e.g. ``javascript:`` URLs are not
    touched).

    Args:
        html_content: Raw HTML; empty or ``None`` is tolerated.

    Returns:
        The filtered HTML, or ``""`` when ``html_content`` is falsy.
    """
    if not html_content:
        return ""

    block_flags = re.DOTALL | re.IGNORECASE

    # Remove script tags; '\s*' before '>' because browsers accept
    # "</script >" as a valid closing tag.
    html_content = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', html_content, flags=block_flags)

    # Remove style tags
    html_content = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', html_content, flags=block_flags)

    # Remove event handlers. Each quote style is matched separately so a
    # value such as onclick="alert('x')" is consumed up to its own closing
    # quote; the last alternative covers unquoted values (onerror=alert(1)).
    html_content = re.sub(
        r'''\s+on\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)''',
        '',
        html_content,
        flags=re.IGNORECASE,
    )

    return html_content