wuhp committed on
Commit
c720489
·
verified ·
1 Parent(s): b76588a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -22
app.py CHANGED
@@ -117,9 +117,14 @@ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
117
  # -------- System/notification heuristics (bucket as cluster -2) --------
118
  NOTIFY_PATTERNS = [
119
  r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
120
- r"verification code", r"two[-\s]?factor", r"\botp\b", r"\bcode[:\s]",
121
  r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
122
- r"unable to determine", r"reset your password", r"\balert\b"
 
 
 
 
 
123
  ]
124
  NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
125
  def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
@@ -169,24 +174,49 @@ MONTHS = {
169
  "january","february","march","april","june","july","august","september",
170
  "october","november","december"
171
  }
172
- # Extra junk/HTML/MIME terms to suppress in labels
 
173
  STOP_TERMS = {
174
  "div","span","nbsp","href","src","img","class","style","align","border","cid",
175
  "content","content-type","multipart","alternative","quoted","printable","utf",
176
  "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
177
- # generic meta-ish that dominated some clusters
178
- "type","id","service","person","generated"
179
  }
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
182
  YEAR_RE = re.compile(r"^(19|20)\d{2}$")
183
  NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
184
  ONE_CHAR_RE = re.compile(r"^.$")
185
 
 
 
 
186
  def _is_junk_term(t: str) -> bool:
187
  tl = t.lower()
188
- if tl in STOP_TERMS: return True
189
- if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
190
  return True
191
  if EMAIL_LIKE_RE.search(tl): return True
192
  if YEAR_RE.match(tl): return True
@@ -241,6 +271,9 @@ def strip_quotes_and_sigs(text: str) -> str:
241
  cut = idx if (cut is None or idx < cut) else cut
242
  if cut is not None:
243
  text = text[:cut]
 
 
 
244
  return text.strip()
245
 
246
  def parse_name_email(s: str) -> Tuple[str, str]:
@@ -581,8 +614,16 @@ def enrich_text(row: pd.Series) -> str:
581
  tokens.append(lang_tok)
582
  return (t + " " + " ".join(tokens)).strip()
583
 
584
- # =================== Cluster labeling: improved PMI + class-TFIDF ===================
585
- def cluster_labels_pmi_bigram(texts, labels, topn=6):
 
 
 
 
 
 
 
 
586
  import math as _math
587
  from collections import Counter, defaultdict
588
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -596,7 +637,9 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
596
  if tl in STOP_TERMS: return True # extra HTML/MIME junk
597
  if tl in HEADER_STOP: return True
598
  if "@" in tl: return True
599
- # drop tokens that are basically punctuation blobs (keep apostrophes)
 
 
600
  if re.search(r"[^\w\-']", tl):
601
  if "’" not in tl and "'" not in tl:
602
  return True
@@ -612,19 +655,37 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
612
  glob_bg = Counter()
613
  per_c_bg = defaultdict(Counter)
614
  per_c_texts = defaultdict(list)
 
 
 
 
 
 
 
615
 
616
- for txt, c in zip(texts, labels):
 
617
  toks = tokenize_clean(txt)
618
  bgs = set(bigrams(toks))
619
  glob_bg.update(bgs)
620
- per_c_bg[int(c)].update(bgs)
621
- per_c_texts[int(c)].append(" ".join(toks))
 
 
 
 
 
 
 
 
622
 
623
  labels_out = {}
624
  total_bg = sum(glob_bg.values()) + 1e-12
625
 
626
  for c in sorted(set(int(x) for x in labels)):
627
- # PMI bigrams
 
 
628
  scores = []
629
  total_c = sum(per_c_bg[c].values()) + 1e-12
630
  for bg, cnt in per_c_bg[c].most_common(2000):
@@ -632,11 +693,16 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
632
  p_bg = (glob_bg[bg] / total_bg)
633
  if p_bg > 0 and p_bg_c > 0:
634
  score = _math.log(p_bg_c) - _math.log(p_bg)
 
 
 
 
 
635
  scores.append((score, bg))
636
  scores.sort(reverse=True)
637
  top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
638
 
639
- # class-TFIDF unigrams (cluster doc vs. background doc)
640
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
641
  docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
642
  corpus = [docs_c[0], docs_bg[0]]
@@ -647,7 +713,22 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
647
  X = vec.fit_transform(corpus)
648
  vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
649
  row = X[0].toarray().ravel()
650
- top_idx = row.argsort()[::-1][: max(0, topn - len(top_bi)) ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  top_uni = []
652
  for i in top_idx:
653
  tok = vocab[i]
@@ -696,7 +777,7 @@ def merge_close_clusters(labels, centers, thresh=0.92):
696
  while parent[a]!=a: a=parent[a]
697
  return a
698
  for i in range(k):
699
- for j in range(i+1, k):
700
  if sim[i,j] >= thresh:
701
  pi, pj = find(i), find(j)
702
  if pi!=pj: parent[pj]=pi
@@ -1037,6 +1118,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1037
 
1038
  # Enriched texts (adds __HAS_*__ flags + __LANG__)
1039
  texts = list(df_main.apply(enrich_text, axis=1))
 
1040
 
1041
  # === Vectorization ===
1042
  ngram_range = (1, 2) if use_bigrams else (1, 1)
@@ -1049,6 +1131,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1049
  token_pattern=TOKEN_PATTERN,
1050
  lowercase=True,
1051
  dtype=np.float32,
 
1052
  )
1053
  TF = count_vec.fit_transform(texts)
1054
  bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
@@ -1060,8 +1143,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1060
  )
1061
  X_char = char_vec.fit_transform(texts)
1062
 
1063
- # Down-weight char-grams so they don't dominate geometry
1064
- X_full = hstack([X_word, X_char * 0.25], format="csr")
1065
  d_word = X_word.shape[1]
1066
  d_char = X_char.shape[1]
1067
  d_full = X_full.shape[1]
@@ -1125,7 +1208,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1125
 
1126
  # Attach clustering back to df_main
1127
  df_main["cluster_id"] = labels
1128
- term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
1129
  df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
1130
  df_main["anomaly_score"] = anomaly_scores
1131
 
@@ -1225,7 +1308,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1225
 
1226
  status_md = (
1227
  f"**Processed {len(df):,} emails** \n"
1228
- f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.25) | Total: {d_full:,} \n"
1229
  f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
1230
  f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1231
  f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
@@ -1355,7 +1438,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1355
  else:
1356
  q_emb = q_vec_full
1357
 
1358
- # ensure the mapping lines up with df_main order (exclude -1 and -2)
1359
  mask = ~df["cluster_id"].isin([-1, -2])
1360
  filtered_df = df[mask]
1361
 
 
117
  # -------- System/notification heuristics (bucket as cluster -2) --------
118
  NOTIFY_PATTERNS = [
119
  r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
120
+ r"verification code", r"two[-\s]?factor|\b2fa\b", r"\botp\b", r"\bcode[:\s]",
121
  r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
122
+ r"unable to determine", r"reset your password", r"\balert\b",
123
+ # bounces / gateways / quarantine
124
+ r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
125
+ r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
126
+ r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
127
+ r"spam digest", r"phishing", r"security gateway", r"mail[-\s]?secure|secure message"
128
  ]
129
  NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
130
  def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
 
174
  "january","february","march","april","june","july","august","september",
175
  "october","november","december"
176
  }
177
+
178
+ # Extra junk/HTML/MIME terms to suppress in labels (expanded)
179
  STOP_TERMS = {
180
  "div","span","nbsp","href","src","img","class","style","align","border","cid",
181
  "content","content-type","multipart","alternative","quoted","printable","utf",
182
  "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
183
+ "type","id","service","person","generated","fyi"
 
184
  }
185
 
186
+ # NEW: broader stop buckets for labels *and* features
187
+ AUX_STOP = {
188
+ "will","would","should","could","can","cant","cannot","did","do","does","done",
189
+ "have","has","had","having","get","got","make","made","let","need","want",
190
+ "not","dont","didnt","isnt","arent","wasnt","werent","im","youre","hes","shes",
191
+ "weve","ive","theyre","its","ok","okay","pls","please","thx","thanks","regards","best",
192
+ "hi","hello","dear","re","fw","fwd","via","kind"
193
+ }
194
+ CTA_STOP = {
195
+ "click","here","unsubscribe","view","browser","mailto","reply","iphone","android",
196
+ "press","link","below","above","update","newsletter","manage","preferences",
197
+ "לחץ","כאן","נשלח","מה","מה-iphone","הטלפון"
198
+ }
199
+ TECH_META = {
200
+ "quot","nbsp","cid","href","src","img","class","style","div","span","http","https",
201
+ "content","content-type","multipart","alternative","quoted","printable","utf",
202
+ "windows-1255","iso-8859","us-ascii","attachment","filename"
203
+ }
204
+ ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
205
+ HE_EXTRA_STOP = {"עם","או"}
206
+
207
+ # fold into STOP_TERMS and build a vectorizer stoplist
208
+ STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
209
  EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
210
  YEAR_RE = re.compile(r"^(19|20)\d{2}$")
211
  NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
212
  ONE_CHAR_RE = re.compile(r"^.$")
213
 
214
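+ # Stoplist passed to CountVectorizer(stop_words=...). NOTE(review): this is a set; scikit-learn >= 1.2 validates stop_words as a list -- confirm and wrap in list(...) at the call site if needed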
215
+ STOPWORD_FOR_VEC = EN_STOP | HE_STOP | STOP_TERMS
216
+
217
  def _is_junk_term(t: str) -> bool:
218
  tl = t.lower()
219
+ if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
 
220
  return True
221
  if EMAIL_LIKE_RE.search(tl): return True
222
  if YEAR_RE.match(tl): return True
 
271
  cut = idx if (cut is None or idx < cut) else cut
272
  if cut is not None:
273
  text = text[:cut]
274
+ # extra safety for mobile signatures that sneak through
275
+ text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
276
+ text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
277
  return text.strip()
278
 
279
  def parse_name_email(s: str) -> Tuple[str, str]:
 
614
  tokens.append(lang_tok)
615
  return (t + " " + " ".join(tokens)).strip()
616
 
617
+ # =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
618
+ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75):
619
+ """
620
+ Create human-readable labels per cluster using:
621
+ 1) PMI bigrams (cluster vs global) + subject coverage boost
622
+ 2) Class-TFIDF unigrams (cluster vs rest) + subject coverage boost
623
+
624
+ `subjects`: list of subject strings aligned with `texts`
625
+ `subject_alpha`: weight added per token = alpha * coverage_in_subjects (0..1)
626
+ """
627
  import math as _math
628
  from collections import Counter, defaultdict
629
  from sklearn.feature_extraction.text import TfidfVectorizer
 
637
  if tl in STOP_TERMS: return True # extra HTML/MIME junk
638
  if tl in HEADER_STOP: return True
639
  if "@" in tl: return True
640
+ # drop short ASCII like "eb/ys/yl"
641
+ if tl.isascii() and len(tl) <= 2: return True
642
+ # punctuation blobs (keep apostrophes)
643
  if re.search(r"[^\w\-']", tl):
644
  if "’" not in tl and "'" not in tl:
645
  return True
 
655
  glob_bg = Counter()
656
  per_c_bg = defaultdict(Counter)
657
  per_c_texts = defaultdict(list)
658
+ per_c_doc_count = defaultdict(int)
659
+
660
+ # SUBJECT presence (unique tokens/bigrams per subject per doc)
661
+ per_c_subj_uni_docs = defaultdict(Counter)
662
+ per_c_subj_bg_docs = defaultdict(Counter)
663
+
664
+ have_subjects = subjects is not None and len(subjects) == len(texts)
665
 
666
+ for idx, (txt, c) in enumerate(zip(texts, labels)):
667
+ c = int(c)
668
  toks = tokenize_clean(txt)
669
  bgs = set(bigrams(toks))
670
  glob_bg.update(bgs)
671
+ per_c_bg[c].update(bgs)
672
+ per_c_texts[c].append(" ".join(toks))
673
+ per_c_doc_count[c] += 1
674
+
675
+ if have_subjects:
676
+ subj_toks = tokenize_clean(subjects[idx] or "")
677
+ subj_uni_set = set(subj_toks)
678
+ subj_bg_set = set(bigrams(subj_toks))
679
+ per_c_subj_uni_docs[c].update(subj_uni_set)
680
+ per_c_subj_bg_docs[c].update(subj_bg_set)
681
 
682
  labels_out = {}
683
  total_bg = sum(glob_bg.values()) + 1e-12
684
 
685
  for c in sorted(set(int(x) for x in labels)):
686
+ n_docs_c = max(1, per_c_doc_count[c])
687
+
688
+ # PMI bigrams (+ subject boost)
689
  scores = []
690
  total_c = sum(per_c_bg[c].values()) + 1e-12
691
  for bg, cnt in per_c_bg[c].most_common(2000):
 
693
  p_bg = (glob_bg[bg] / total_bg)
694
  if p_bg > 0 and p_bg_c > 0:
695
  score = _math.log(p_bg_c) - _math.log(p_bg)
696
+ # subject coverage boost: fraction of cluster docs whose SUBJECT contains this bigram
697
+ cov = 0.0
698
+ if have_subjects:
699
+ cov = per_c_subj_bg_docs[c][bg] / n_docs_c
700
+ score = score + subject_alpha * cov
701
  scores.append((score, bg))
702
  scores.sort(reverse=True)
703
  top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
704
 
705
+ # class-TFIDF unigrams (cluster doc vs. background doc) + subject boost
706
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
707
  docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
708
  corpus = [docs_c[0], docs_bg[0]]
 
713
  X = vec.fit_transform(corpus)
714
  vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
715
  row = X[0].toarray().ravel()
716
+
717
+ # Build subject coverage vector over this vocab
718
+ subj_cov = np.zeros_like(row)
719
+ if have_subjects:
720
+ vocab_index = {t:i for i,t in enumerate(vocab)}
721
+ for tok, cnt_docs in per_c_subj_uni_docs[c].items():
722
+ if tok in vocab_index:
723
+ subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c # 0..1
724
+
725
+ # Apply boost (only to non-junk tokens)
726
+ row_boosted = row.copy()
727
+ for i, tok in enumerate(vocab):
728
+ if subj_cov[i] > 0 and not is_junk_token(tok):
729
+ row_boosted[i] = row[i] + subject_alpha * float(subj_cov[i])
730
+
731
+ top_idx = row_boosted.argsort()[::-1][: max(0, topn - len(top_bi)) ]
732
  top_uni = []
733
  for i in top_idx:
734
  tok = vocab[i]
 
777
  while parent[a]!=a: a=parent[a]
778
  return a
779
  for i in range(k):
780
+ for j in range(i+1, j := k):
781
  if sim[i,j] >= thresh:
782
  pi, pj = find(i), find(j)
783
  if pi!=pj: parent[pj]=pi
 
1118
 
1119
  # Enriched texts (adds __HAS_*__ flags + __LANG__)
1120
  texts = list(df_main.apply(enrich_text, axis=1))
1121
+ subjects_only = list(df_main["subject"].fillna(""))
1122
 
1123
  # === Vectorization ===
1124
  ngram_range = (1, 2) if use_bigrams else (1, 1)
 
1131
  token_pattern=TOKEN_PATTERN,
1132
  lowercase=True,
1133
  dtype=np.float32,
1134
+ stop_words=STOPWORD_FOR_VEC, # <-- use expanded stoplist
1135
  )
1136
  TF = count_vec.fit_transform(texts)
1137
  bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
 
1143
  )
1144
  X_char = char_vec.fit_transform(texts)
1145
 
1146
+ # Down-weight char-grams so they don't dominate geometry (slightly lower)
1147
+ X_full = hstack([X_word, X_char * 0.20], format="csr")
1148
  d_word = X_word.shape[1]
1149
  d_char = X_char.shape[1]
1150
  d_full = X_full.shape[1]
 
1208
 
1209
  # Attach clustering back to df_main
1210
  df_main["cluster_id"] = labels
1211
+ term_names = cluster_labels_pmi_bigram(texts, labels, subjects=subjects_only, topn=6, subject_alpha=0.75)
1212
  df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
1213
  df_main["anomaly_score"] = anomaly_scores
1214
 
 
1308
 
1309
  status_md = (
1310
  f"**Processed {len(df):,} emails** \n"
1311
+ f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.20) | Total: {d_full:,} \n"
1312
  f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
1313
  f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1314
  f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
 
1438
  else:
1439
  q_emb = q_vec_full
1440
 
1441
+ # align with df_main order (exclude -1 and -2)
1442
  mask = ~df["cluster_id"].isin([-1, -2])
1443
  filtered_df = df[mask]
1444