Spaces:

wuhp
/

testmail2

Sleeping

App Files Community

wuhp committed on Sep 1, 2025

Commit

c720489

verified ·

1 Parent(s): b76588a

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -22

app.py CHANGED Viewed

@@ -117,9 +117,14 @@ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
 # -------- System/notification heuristics (bucket as cluster -2) --------
 NOTIFY_PATTERNS = [
     r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
-    r"verification code", r"two[-\s]?factor", r"\botp\b", r"\bcode[:\s]",
     r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
-    r"unable to determine", r"reset your password", r"\balert\b"
 ]
 NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
 def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
@@ -169,24 +174,49 @@ MONTHS = {
     "january","february","march","april","june","july","august","september",
     "october","november","december"
 }
-# Extra junk/HTML/MIME terms to suppress in labels
 STOP_TERMS = {
     "div","span","nbsp","href","src","img","class","style","align","border","cid",
     "content","content-type","multipart","alternative","quoted","printable","utf",
     "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
-    # generic meta-ish that dominated some clusters
-    "type","id","service","person","generated"
 }
 EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
 YEAR_RE = re.compile(r"^(19|20)\d{2}$")
 NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
 ONE_CHAR_RE = re.compile(r"^.$")
 def _is_junk_term(t: str) -> bool:
     tl = t.lower()
-    if tl in STOP_TERMS: return True
-    if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
         return True
     if EMAIL_LIKE_RE.search(tl): return True
     if YEAR_RE.match(tl): return True
@@ -241,6 +271,9 @@ def strip_quotes_and_sigs(text: str) -> str:
             cut = idx if (cut is None or idx < cut) else cut
     if cut is not None:
         text = text[:cut]
     return text.strip()
 def parse_name_email(s: str) -> Tuple[str, str]:
@@ -581,8 +614,16 @@ def enrich_text(row: pd.Series) -> str:
     tokens.append(lang_tok)
     return (t + " " + " ".join(tokens)).strip()
-# =================== Cluster labeling: improved PMI + class-TFIDF ===================
-def cluster_labels_pmi_bigram(texts, labels, topn=6):
     import math as _math
     from collections import Counter, defaultdict
     from sklearn.feature_extraction.text import TfidfVectorizer
@@ -596,7 +637,9 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
         if tl in STOP_TERMS: return True             # extra HTML/MIME junk
         if tl in HEADER_STOP: return True
         if "@" in tl: return True
-        # drop tokens that are basically punctuation blobs (keep apostrophes)
         if re.search(r"[^\w\-']", tl):
             if "’" not in tl and "'" not in tl:
                 return True
@@ -612,19 +655,37 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
     glob_bg = Counter()
     per_c_bg = defaultdict(Counter)
     per_c_texts = defaultdict(list)
-    for txt, c in zip(texts, labels):
         toks = tokenize_clean(txt)
         bgs = set(bigrams(toks))
         glob_bg.update(bgs)
-        per_c_bg[int(c)].update(bgs)
-        per_c_texts[int(c)].append(" ".join(toks))
     labels_out = {}
     total_bg = sum(glob_bg.values()) + 1e-12
     for c in sorted(set(int(x) for x in labels)):
-        # PMI bigrams
         scores = []
         total_c = sum(per_c_bg[c].values()) + 1e-12
         for bg, cnt in per_c_bg[c].most_common(2000):
@@ -632,11 +693,16 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
             p_bg   = (glob_bg[bg] / total_bg)
             if p_bg > 0 and p_bg_c > 0:
                 score = _math.log(p_bg_c) - _math.log(p_bg)
                 scores.append((score, bg))
         scores.sort(reverse=True)
         top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
-        # class-TFIDF unigrams (cluster doc vs. background doc)
         docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
         docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
         corpus = [docs_c[0], docs_bg[0]]
@@ -647,7 +713,22 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
         X = vec.fit_transform(corpus)
         vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
         row = X[0].toarray().ravel()
-        top_idx = row.argsort()[::-1][: max(0, topn - len(top_bi)) ]
         top_uni = []
         for i in top_idx:
             tok = vocab[i]
@@ -696,7 +777,7 @@ def merge_close_clusters(labels, centers, thresh=0.92):
         while parent[a]!=a: a=parent[a]
         return a
     for i in range(k):
-        for j in range(i+1, k):
             if sim[i,j] >= thresh:
                 pi, pj = find(i), find(j)
                 if pi!=pj: parent[pj]=pi
@@ -1037,6 +1118,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         # Enriched texts (adds __HAS_*__ flags + __LANG__)
         texts = list(df_main.apply(enrich_text, axis=1))
         # === Vectorization ===
         ngram_range = (1, 2) if use_bigrams else (1, 1)
@@ -1049,6 +1131,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             token_pattern=TOKEN_PATTERN,
             lowercase=True,
             dtype=np.float32,
         )
         TF = count_vec.fit_transform(texts)
         bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
@@ -1060,8 +1143,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         )
         X_char = char_vec.fit_transform(texts)
-        # Down-weight char-grams so they don't dominate geometry
-        X_full = hstack([X_word, X_char * 0.25], format="csr")
         d_word = X_word.shape[1]
         d_char = X_char.shape[1]
         d_full = X_full.shape[1]
@@ -1125,7 +1208,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         # Attach clustering back to df_main
         df_main["cluster_id"] = labels
-        term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
         df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
         df_main["anomaly_score"] = anomaly_scores
@@ -1225,7 +1308,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         status_md = (
             f"**Processed {len(df):,} emails**  \n"
-            f"Word feats (BM25): {d_word:,}  |  Char feats: {d_char:,} (x0.25)  |  Total: {d_full:,}  \n"
             f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims  |  ' if use_lsa else ''}"
             f"k = {k}  |  Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}  |  "
             f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
@@ -1355,7 +1438,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         else:
             q_emb = q_vec_full
-        # ensure the mapping lines up with df_main order (exclude -1 and -2)
         mask = ~df["cluster_id"].isin([-1, -2])
         filtered_df = df[mask]

 # -------- System/notification heuristics (bucket as cluster -2) --------
 NOTIFY_PATTERNS = [
     r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
+    r"verification code", r"two[-\s]?factor|\b2fa\b", r"\botp\b", r"\bcode[:\s]",
     r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
+    r"unable to determine", r"reset your password", r"\balert\b",
+    # bounces / gateways / quarantine
+    r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
+    r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
+    r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
+    r"spam digest", r"phishing", r"security gateway", r"mail[-\s]?secure|secure message"
 ]
 NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
 def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
     "january","february","march","april","june","july","august","september",
     "october","november","december"
 }
+# Extra junk/HTML/MIME terms to suppress in labels (expanded)
 STOP_TERMS = {
     "div","span","nbsp","href","src","img","class","style","align","border","cid",
     "content","content-type","multipart","alternative","quoted","printable","utf",
     "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
+    "type","id","service","person","generated","fyi"
 }
+# NEW: broader stop buckets for labels *and* features
+AUX_STOP = {
+    "will","would","should","could","can","cant","cannot","did","do","does","done",
+    "have","has","had","having","get","got","make","made","let","need","want",
+    "not","dont","didnt","isnt","arent","wasnt","werent","im","youre","hes","shes",
+    "weve","ive","theyre","its","ok","okay","pls","please","thx","thanks","regards","best",
+    "hi","hello","dear","re","fw","fwd","via","kind"
+}
+CTA_STOP = {
+    "click","here","unsubscribe","view","browser","mailto","reply","iphone","android",
+    "press","link","below","above","update","newsletter","manage","preferences",
+    "לחץ","כאן","נשלח","מה","מה-iphone","הטלפון"
+}
+TECH_META = {
+    "quot","nbsp","cid","href","src","img","class","style","div","span","http","https",
+    "content","content-type","multipart","alternative","quoted","printable","utf",
+    "windows-1255","iso-8859","us-ascii","attachment","filename"
+}
+ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
+HE_EXTRA_STOP = {"עם","או"}
+# fold the extra buckets into STOP_TERMS; the vectorizer stoplist (STOPWORD_FOR_VEC) is built from it further down
+STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
 EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
 YEAR_RE = re.compile(r"^(19|20)\d{2}$")
 NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
 ONE_CHAR_RE = re.compile(r"^.$")
+# This stoplist is used by the CountVectorizer
+STOPWORD_FOR_VEC = EN_STOP | HE_STOP | STOP_TERMS
 def _is_junk_term(t: str) -> bool:
     tl = t.lower()
+    if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
         return True
     if EMAIL_LIKE_RE.search(tl): return True
     if YEAR_RE.match(tl): return True
             cut = idx if (cut is None or idx < cut) else cut
     if cut is not None:
         text = text[:cut]
+    # extra safety for mobile signatures that sneak through
+    text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
+    text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
     return text.strip()
 def parse_name_email(s: str) -> Tuple[str, str]:
     tokens.append(lang_tok)
     return (t + " " + " ".join(tokens)).strip()
+# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
+def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75):
+    """
+    Create human-readable labels per cluster using:
+      1) PMI bigrams (cluster vs global)   + subject coverage boost
+      2) Class-TFIDF unigrams (cluster vs rest) + subject coverage boost
+    `subjects`: list of subject strings aligned with `texts`
+    `subject_alpha`: weight added per token = alpha * coverage_in_subjects (0..1)
+    """
     import math as _math
     from collections import Counter, defaultdict
     from sklearn.feature_extraction.text import TfidfVectorizer
         if tl in STOP_TERMS: return True             # extra HTML/MIME junk
         if tl in HEADER_STOP: return True
         if "@" in tl: return True
+        # drop short ASCII like "eb/ys/yl"
+        if tl.isascii() and len(tl) <= 2: return True
+        # punctuation blobs (keep apostrophes)
         if re.search(r"[^\w\-']", tl):
             if "’" not in tl and "'" not in tl:
                 return True
     glob_bg = Counter()
     per_c_bg = defaultdict(Counter)
     per_c_texts = defaultdict(list)
+    per_c_doc_count = defaultdict(int)
+    # SUBJECT presence (unique tokens/bigrams per subject per doc)
+    per_c_subj_uni_docs = defaultdict(Counter)
+    per_c_subj_bg_docs  = defaultdict(Counter)
+    have_subjects = subjects is not None and len(subjects) == len(texts)
+    for idx, (txt, c) in enumerate(zip(texts, labels)):
+        c = int(c)
         toks = tokenize_clean(txt)
         bgs = set(bigrams(toks))
         glob_bg.update(bgs)
+        per_c_bg[c].update(bgs)
+        per_c_texts[c].append(" ".join(toks))
+        per_c_doc_count[c] += 1
+        if have_subjects:
+            subj_toks = tokenize_clean(subjects[idx] or "")
+            subj_uni_set = set(subj_toks)
+            subj_bg_set  = set(bigrams(subj_toks))
+            per_c_subj_uni_docs[c].update(subj_uni_set)
+            per_c_subj_bg_docs[c].update(subj_bg_set)
     labels_out = {}
     total_bg = sum(glob_bg.values()) + 1e-12
     for c in sorted(set(int(x) for x in labels)):
+        n_docs_c = max(1, per_c_doc_count[c])
+        # PMI bigrams (+ subject boost)
         scores = []
         total_c = sum(per_c_bg[c].values()) + 1e-12
         for bg, cnt in per_c_bg[c].most_common(2000):
             p_bg   = (glob_bg[bg] / total_bg)
             if p_bg > 0 and p_bg_c > 0:
                 score = _math.log(p_bg_c) - _math.log(p_bg)
+                # subject coverage boost: fraction of cluster docs whose SUBJECT contains this bigram
+                cov = 0.0
+                if have_subjects:
+                    cov = per_c_subj_bg_docs[c][bg] / n_docs_c
+                score = score + subject_alpha * cov
                 scores.append((score, bg))
         scores.sort(reverse=True)
         top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
+        # class-TFIDF unigrams (cluster doc vs. background doc) + subject boost
         docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
         docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
         corpus = [docs_c[0], docs_bg[0]]
         X = vec.fit_transform(corpus)
         vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
         row = X[0].toarray().ravel()
+        # Build subject coverage vector over this vocab
+        subj_cov = np.zeros_like(row)
+        if have_subjects:
+            vocab_index = {t:i for i,t in enumerate(vocab)}
+            for tok, cnt_docs in per_c_subj_uni_docs[c].items():
+                if tok in vocab_index:
+                    subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c  # 0..1
+        # Apply boost (only to non-junk tokens)
+        row_boosted = row.copy()
+        for i, tok in enumerate(vocab):
+            if subj_cov[i] > 0 and not is_junk_token(tok):
+                row_boosted[i] = row[i] + subject_alpha * float(subj_cov[i])
+        top_idx = row_boosted.argsort()[::-1][: max(0, topn - len(top_bi)) ]
         top_uni = []
         for i in top_idx:
             tok = vocab[i]
         while parent[a]!=a: a=parent[a]
         return a
     for i in range(k):
+        for j in range(i+1, j := k):
             if sim[i,j] >= thresh:
                 pi, pj = find(i), find(j)
                 if pi!=pj: parent[pj]=pi
         # Enriched texts (adds __HAS_*__ flags + __LANG__)
         texts = list(df_main.apply(enrich_text, axis=1))
+        subjects_only = list(df_main["subject"].fillna(""))
         # === Vectorization ===
         ngram_range = (1, 2) if use_bigrams else (1, 1)
             token_pattern=TOKEN_PATTERN,
             lowercase=True,
             dtype=np.float32,
+            stop_words=STOPWORD_FOR_VEC,   # <-- use expanded stoplist
         )
         TF = count_vec.fit_transform(texts)
         bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
         )
         X_char = char_vec.fit_transform(texts)
+        # Down-weight char-grams so they don't dominate geometry (slightly lower)
+        X_full = hstack([X_word, X_char * 0.20], format="csr")
         d_word = X_word.shape[1]
         d_char = X_char.shape[1]
         d_full = X_full.shape[1]
         # Attach clustering back to df_main
         df_main["cluster_id"] = labels
+        term_names = cluster_labels_pmi_bigram(texts, labels, subjects=subjects_only, topn=6, subject_alpha=0.75)
         df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
         df_main["anomaly_score"] = anomaly_scores
         status_md = (
             f"**Processed {len(df):,} emails**  \n"
+            f"Word feats (BM25): {d_word:,}  |  Char feats: {d_char:,} (x0.20)  |  Total: {d_full:,}  \n"
             f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims  |  ' if use_lsa else ''}"
             f"k = {k}  |  Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}  |  "
             f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
         else:
             q_emb = q_vec_full
+        # align with df_main order (exclude -1 and -2)
         mask = ~df["cluster_id"].isin([-1, -2])
         filtered_df = df[mask]