niwayandm committed on
Commit
9625e2f
·
1 Parent(s): 0fe3308

Update emails to include normalized subject

Browse files
Files changed (1) hide show
  1. python/hubspot_emails.py +20 -1
python/hubspot_emails.py CHANGED
@@ -80,7 +80,7 @@ EMAIL_PROPERTIES = [
80
  # Email parsing
81
  # -----------------------------------------------------------------------------
82
  EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')
83
-
84
 
85
  def parse_emails(raw: Optional[object]) -> List[str]:
86
  if raw is None:
@@ -100,6 +100,24 @@ def parse_emails(raw: Optional[object]) -> List[str]:
100
  candidates.extend(EMAIL_RE.findall(str(raw)))
101
  return sorted({c.strip().lower() for c in candidates if c and c.strip()})
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # -----------------------------------------------------------------------------
104
  # Time helpers
105
  # -----------------------------------------------------------------------------
@@ -263,6 +281,7 @@ def read_emails_by_ids(
263
  email_metadata_data.append({
264
  "email_id": record.id,
265
  "subject": props.get("hs_email_subject"),
 
266
  "from_email": props.get("hs_email_from_email") or "",
267
  "to_emails": parse_emails(props.get("hs_email_to_email")),
268
  "sent_at": sent_at_iso,
 
80
  # Email parsing
81
  # -----------------------------------------------------------------------------
82
  EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')
83
+ SUBJECT_PREFIX_RE = re.compile(r'^(re:|fw:|fwd:)\s*', re.IGNORECASE)
84
 
85
  def parse_emails(raw: Optional[object]) -> List[str]:
86
  if raw is None:
 
100
  candidates.extend(EMAIL_RE.findall(str(raw)))
101
  return sorted({c.strip().lower() for c in candidates if c and c.strip()})
102
 
103
def normalize_subject(raw: Optional[str]) -> Optional[str]:
    """Return a lower-cased, prefix-free, whitespace-collapsed subject.

    The input is trimmed and lower-cased, every leading prefix matched by
    ``SUBJECT_PREFIX_RE`` is removed (repeatedly, so stacked prefixes are
    all dropped), and each run of whitespace is reduced to a single space.

    Args:
        raw: The subject text, or a falsy value such as ``None`` or ``""``.

    Returns:
        The normalized subject, or ``None`` when ``raw`` is falsy or when
        nothing is left after normalization.
    """
    if not raw:
        return None

    subject = raw.strip().lower()

    # Peel leading prefixes off one at a time. The pattern is anchored at
    # the start of the string, so each pass removes at most one prefix.
    while True:
        prefix = SUBJECT_PREFIX_RE.match(subject)
        if prefix is None:
            break
        subject = subject[prefix.end():].strip()

    # Squeeze every whitespace run down to one space.
    subject = re.sub(r"\s+", " ", subject).strip()

    return subject if subject else None
120
+
121
  # -----------------------------------------------------------------------------
122
  # Time helpers
123
  # -----------------------------------------------------------------------------
 
281
  email_metadata_data.append({
282
  "email_id": record.id,
283
  "subject": props.get("hs_email_subject"),
284
+ "normalized_subject": normalize_subject(props.get("hs_email_subject")),
285
  "from_email": props.get("hs_email_from_email") or "",
286
  "to_emails": parse_emails(props.get("hs_email_to_email")),
287
  "sent_at": sent_at_iso,