File size: 2,572 Bytes
9ea5e05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import regex as re

# Actionability and clear asks grounded in research about explicit CTAs and timeliness.
# Every entry is a case-insensitive regex source string (inline ``(?i)`` flag).
ASK_PATTERNS = [
    # "please" followed directly by a request verb.
    r"(?i)\bplease (review|confirm|approve|advise|respond|sign|share)\b",
    # Stand-alone meeting / sign-up verbs.
    r"(?i)\b(schedule|book|call|join|attend|register)\b",
    # Fixed "something is needed from you" phrases.
    r"(?i)\b(action required|needs your approval|need your input)\b",
    # Deadline: "by Fri", "by Friday", "by 5", "by 5pm".
    # The full weekday names are spelled out as optional suffixes: with only
    # the three-letter stems, the trailing \b rejected "by Monday" because
    # there is no word boundary between "mon" and "day".
    r"(?i)\bby (mon(?:day)?|tue(?:s(?:day)?)?|wed(?:nesday)?|thu(?:r(?:s(?:day)?)?)?|fri(?:day)?|sat(?:urday)?|sun(?:day)?|\d{1,2})(?:\s*(?:am|pm)?)?\b",
    # Imperative verb at the very start of the text (no MULTILINE flag, so
    # ``^`` anchors to the start of the whole string only).
    r"(?i)^(please|review|confirm|approve|send|attach|reply|rsvp)\b",
    # Question-style request: "could you confirm", "can we move", ...
    r"(?i)\b(could|can|would|please) (we|you) (move|change|reschedule|confirm|review|proceed)\b",
    # Rescheduling / follow-up vocabulary.
    r"(?i)\b(reschedule|move|change|follow up|following up)\b",
]

# Case-insensitive regex source strings looked for in a subject line.
SUBJECT_USEFUL = [
    # Single keywords, matched as whole words.
    r"(?i)\breview\b",
    r"(?i)\bupdate\b",
    r"(?i)\binvoice\b",
    r"(?i)\bsummary\b",
    r"(?i)\bmetrics?\b",
    # Deadline: "by Fri", "by Friday", "by 15".
    # Full weekday names are optional suffixes; with only the three-letter
    # stems, the trailing \b rejected "by Friday" (no boundary inside the word).
    r"(?i)\bby (mon(?:day)?|tue(?:s(?:day)?)?|wed(?:nesday)?|thu(?:r(?:s(?:day)?)?)?|fri(?:day)?|\d{1,2})\b",
    # "schedule" / "scheduling".
    r"(?i)\bschedul(?:e|ing)\b",
    # Reminder-type words; "follow up", "followup" and "follow  up" all match.
    r"(?i)\b(reminder|follow\s*up|deadline)\b",
]

# Opening salutation; ``^`` pins it to the very start of the text.
GREETINGS = [
    r"(?i)^(hi|hello|hey|good (morning|afternoon|evening)|dear)\b",
]
# Closing words, matched as whole words anywhere in the text.
SIGNOFFS = [
    r"(?i)\b(regards|best|sincerely|thanks|thank you|cheers)\b",
]

# Tone-analysis patterns (case-insensitive regex source strings).
# Phrases flagged as passive-aggressive; none are word-boundary anchored,
# so each matches wherever the phrase occurs in the text.
PASSIVE_AGGRESSIVE = [
    r"(?i)per my last email",
    r"(?i)as previously (stated|mentioned)",
    r"(?i)as I (said|mentioned)",
    r"(?i)kindly (note|remind)",
    r"(?i)actually,?",  # optional comma is included in the matched span
    r"(?i)you should have",
    r"(?i)if you had",
    r"(?i)hope that makes sense",
]

# Case-insensitive regex source strings for openly hostile wording.
HOSTILE = [
    r"(?i)or else",
    r"(?i)you (will|shall) suffer",
    r"(?i)make sure you (guys\s+)?suffer",
    r"(?i)threat(en|s|ening)?",  # "threat", "threats", "threaten", "threatening"
    r"(?i)shut up",
]

# Spam-related regex groups grounded in industry guidance (urgency, rewards, marketing calls).
# Urgency / time-pressure wording, all case-insensitive.
SPAM_URGENCY = [
    r"(?i)act now",
    r"(?i)limited time",
    r"(?i)expires in\b",
    r"(?i)24\s*hours",  # "24 hours" and "24hours"
    r"(?i)once in a lifetime",
    r"(?i)don['’]t miss out",  # straight or typographic apostrophe
    r"(?i)urgent",
]

# Case-insensitive regex source strings for reward / prize bait.
SPAM_REWARD = [
    r"(?i)congratulations",
    r"(?i)selected",
    # "exclusive", optionally followed by a reward noun. The word boundary
    # replaces a mandatory trailing space, so "exclusive." and "exclusive" at
    # end of text now match and the span no longer carries a dangling space.
    r"(?i)exclusive\b(?:\s+(reward|deal|offer))?",
    r"(?i)reward",
    r"(?i)prize",
    r"(?i)cash",
    # Whole-word "win" / "winner". The leading \b stops hits inside other
    # words such as "twin" or "Darwin".
    r"(?i)\bwin(ner)?\b",
    # Dollar sign followed by an amount of at least two digits.
    r"(?i)\$\s*\d{2,}",
]

# Case-insensitive regex source strings for spam-style calls to action.
SPAM_CALLS = [
    r"(?i)click here",
    r"(?i)claim (your )?(prize|reward|offer)",  # "claim prize" / "claim your prize"
    r"(?i)redeem now",
]

# Case-insensitive regex source strings for marketing boilerplate.
SPAM_MARKETING = [
    r"(?i)free (trial|access|gift)",
    r"(?i)no obligation",
    r"(?i)risk[- ]?free",  # "risk-free", "risk free", "riskfree"
]

# Pairs of commonly confused words, stored as 2-tuples of plain strings.
HOMOPHONES = [
    ("its", "it's"),
    ("your", "you're"),
    ("there", "their"),
    ("there", "they're"),
    ("to", "too"),
    ("than", "then"),
    ("affect", "effect"),
]

def any_match(patterns, text) -> bool:
    """Report whether at least one regex in *patterns* is found in *text*.

    Each pattern is tried with ``re.search``, so a hit anywhere in the text
    counts. A falsy *text* (such as ``None`` or ``""``) is searched as the
    empty string. Returns ``True`` at the first hit and ``False`` when no
    pattern matches or *patterns* is empty.
    """
    for pattern in patterns:
        haystack = text or ""
        if re.search(pattern, haystack):
            return True
    return False

def find_spans(patterns, text):
    """Collect every regex hit in *text* as ``(matched_text, start, end)``.

    Patterns are processed in the order given; within one pattern, hits come
    in the order ``re.finditer`` yields them. Results from different patterns
    are simply concatenated, so overlapping or duplicate spans are kept.
    A falsy *text* (such as ``None``) is scanned as the empty string.
    """
    haystack = text or ""
    return [
        (hit.group(0), hit.start(), hit.end())
        for pattern in patterns
        for hit in re.finditer(pattern, haystack)
    ]