niwayandm committed on
Commit ·
9625e2f
1
Parent(s): 0fe3308
Update emails to include normalized subject
Browse files- python/hubspot_emails.py +20 -1
python/hubspot_emails.py
CHANGED
|
@@ -80,7 +80,7 @@ EMAIL_PROPERTIES = [
|
|
| 80 |
# Email parsing
|
| 81 |
# -----------------------------------------------------------------------------
|
| 82 |
EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')
|
| 83 |
-
|
| 84 |
|
| 85 |
def parse_emails(raw: Optional[object]) -> List[str]:
|
| 86 |
if raw is None:
|
|
@@ -100,6 +100,24 @@ def parse_emails(raw: Optional[object]) -> List[str]:
|
|
| 100 |
candidates.extend(EMAIL_RE.findall(str(raw)))
|
| 101 |
return sorted({c.strip().lower() for c in candidates if c and c.strip()})
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
# -----------------------------------------------------------------------------
|
| 104 |
# Time helpers
|
| 105 |
# -----------------------------------------------------------------------------
|
|
@@ -263,6 +281,7 @@ def read_emails_by_ids(
|
|
| 263 |
email_metadata_data.append({
|
| 264 |
"email_id": record.id,
|
| 265 |
"subject": props.get("hs_email_subject"),
|
|
|
|
| 266 |
"from_email": props.get("hs_email_from_email") or "",
|
| 267 |
"to_emails": parse_emails(props.get("hs_email_to_email")),
|
| 268 |
"sent_at": sent_at_iso,
|
|
|
|
| 80 |
# Email parsing
|
| 81 |
# -----------------------------------------------------------------------------
|
| 82 |
# Matches address-like tokens: a local part made of word characters, dots,
# plus signs or hyphens, then "@", then a domain of word characters, dots or
# hyphens that ends in a dot followed by word characters.
EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')
|
| 83 |
+
# Matches a single reply/forward marker ("re:", "fw:" or "fwd:", in any
# letter case) anchored at the start of the string, plus any whitespace that
# immediately follows it.
SUBJECT_PREFIX_RE = re.compile(r'^(re:|fw:|fwd:)\s*', re.IGNORECASE)
|
| 84 |
|
| 85 |
def parse_emails(raw: Optional[object]) -> List[str]:
|
| 86 |
if raw is None:
|
|
|
|
| 100 |
candidates.extend(EMAIL_RE.findall(str(raw)))
|
| 101 |
return sorted({c.strip().lower() for c in candidates if c and c.strip()})
|
| 102 |
|
| 103 |
+
def normalize_subject(raw: Optional[str]) -> Optional[str]:
    """Return a canonical form of an email subject line.

    The subject is trimmed and lowercased, leading markers matched by
    SUBJECT_PREFIX_RE are removed repeatedly (so stacked markers such as
    "re: fwd: re:" are all dropped), and every run of whitespace is
    collapsed to a single space.

    Args:
        raw: The subject as received; may be None or empty.

    Returns:
        The normalized subject, or None when the input is falsy or when
        nothing is left after cleanup.
    """
    if not raw:
        return None

    subject = raw.strip().lower()

    # Peel off one leading marker per pass; stop once a pass changes nothing.
    stripped = SUBJECT_PREFIX_RE.sub("", subject)
    while stripped != subject:
        subject = stripped.strip()
        stripped = SUBJECT_PREFIX_RE.sub("", subject)

    # Squash internal whitespace runs down to single spaces.
    collapsed = re.sub(r"\s+", " ", subject).strip()

    return collapsed if collapsed else None
|
| 120 |
+
|
| 121 |
# -----------------------------------------------------------------------------
|
| 122 |
# Time helpers
|
| 123 |
# -----------------------------------------------------------------------------
|
|
|
|
| 281 |
email_metadata_data.append({
|
| 282 |
"email_id": record.id,
|
| 283 |
"subject": props.get("hs_email_subject"),
|
| 284 |
+
"normalized_subject": normalize_subject(props.get("hs_email_subject")),
|
| 285 |
"from_email": props.get("hs_email_from_email") or "",
|
| 286 |
"to_emails": parse_emails(props.get("hs_email_to_email")),
|
| 287 |
"sent_at": sent_at_iso,
|