Spaces:

wuhp
/

testmail2

Sleeping

App Files Files Community

wuhp committed on Sep 1, 2025

Commit

f97dfc3

verified ·

1 Parent(s): 042f4b6

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -44

app.py CHANGED Viewed

@@ -103,6 +103,17 @@ OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
 # Common personal mail domains (used with user-specified trusted org domains)
 PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
 # Optional seeded themes for semi-supervised init (used only when LSA is ON)
 CORR_LEX = {
     "kickback"      : ["kickback","bribe","under the table","gift","cash"],
@@ -111,7 +122,7 @@ CORR_LEX = {
     "money_flow"    : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
 }
-# =================== Label cleanup helpers (unchanged core) ===================
 EN_STOP = {
     "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
     "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
@@ -522,37 +533,79 @@ def enrich_text(row: pd.Series) -> str:
     if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
     if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
     if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
     return (t + " " + " ".join(tokens)).strip()
-# =================== Cluster labeling: PMI bigrams ===================
 def cluster_labels_pmi_bigram(texts, labels, topn=6):
-    def bigrams(t):
-        toks = re.findall(TOKEN_PATTERN, t.lower())
-        return [" ".join(p) for p in zip(toks, toks[1:])]
-    N = len(texts)
-    from collections import Counter
     import math as _math
     glob_bg = Counter()
-    per_c   = {int(c): Counter() for c in np.unique(labels)}
-    for t, c in zip(texts, labels):
-        bgs = set(bigrams(t))
         glob_bg.update(bgs)
-        per_c[int(c)].update(bgs)
     labels_out = {}
-    total_bg = sum(glob_bg.values()) + 1e-9
-    for c in np.unique(labels):
-        c = int(c)
         scores = []
-        total_c = sum(per_c[c].values()) + 1e-9
-        for bg, cnt in per_c[c].most_common(1000):
             p_bg_c = cnt / total_c
             p_bg   = (glob_bg[bg] / total_bg)
             if p_bg > 0 and p_bg_c > 0:
                 score = _math.log(p_bg_c) - _math.log(p_bg)
                 scores.append((score, bg))
         scores.sort(reverse=True)
-        top = [bg for _, bg in scores[:topn]]
-        labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
     return labels_out
 # =================== Auto-k & merge ===================
@@ -707,13 +760,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     with gr.Accordion("Vectorization & Clustering", open=True):
         with gr.Row():
             max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
-            min_df = gr.Number(label="min_df (doc freq ≥)", value=2, precision=0)
             max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
             use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
             skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
         with gr.Row():
             use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
-            lsa_dim = gr.Number(label="LSA components", value=150, precision=0)
             auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
             k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
             mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
@@ -878,7 +931,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         # trusted org domains
         trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
         extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
-        # extend SUSPECT_PHRASES runtime (no mutation of constant list)
         extra_terms_lower = [t.lower() for t in extra_terms]
         recs = _load_json_records(inbox_file.name)
@@ -921,8 +974,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             flags.append(f)
         df["flags"] = flags
-        # Enriched texts (adds __HAS_*__ flags)
-        texts = list(df.apply(enrich_text, axis=1))
         # === Vectorization ===
         ngram_range = (1, 2) if use_bigrams else (1, 1)
@@ -946,7 +1004,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         )
         X_char = char_vec.fit_transform(texts)
-        X_full = hstack([X_word, X_char], format="csr")
         d_word = X_word.shape[1]
         d_char = X_char.shape[1]
         d_full = X_full.shape[1]
@@ -965,16 +1024,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             gc.collect()
         # Optional anomaly detection (on LSA space)
-        anomaly_scores = np.full((len(df),), np.nan, dtype=np.float32)
         if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
             try:
                 iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
                 iso.fit(X_reduced)
-                # higher is less anomalous; convert to anomaly score = -score
-                anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
             except Exception:
                 pass
-        df["anomaly_score"] = anomaly_scores
         # K selection
         if bool(auto_k):
@@ -1006,20 +1063,29 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         )
         labels = kmeans.fit_predict(X_space)
-        # Merge very-similar clusters (LSA only)
         if use_lsa:
-            labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
-        df["cluster_id"] = labels
-        # Cluster names
         term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
-        df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
-        # CorruptionScore (now uses trusted domains)
         df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
-        # Build search index
         use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
         index_obj = None
         if use_faiss:
@@ -1033,7 +1099,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         # Summaries
         cluster_counts = (
-            df.groupby(["cluster_id", "cluster_name"]).size()
               .reset_index(name="count")
               .sort_values("count", ascending=False)
               .head(500)
@@ -1041,6 +1108,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         cluster_counts["label"] = cluster_counts.apply(
             lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
         cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
         domain_counts = (
@@ -1084,7 +1160,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         status_md = (
             f"**Processed {len(df):,} emails**  \n"
-            f"Word feats (BM25): {d_word:,}  |  Char feats: {d_char:,}  |  Total: {d_full:,}  \n"
             f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims  |  ' if use_lsa else ''}"
             f"k = {k}  |  Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}  |  "
             f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
@@ -1143,7 +1219,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
             tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
         else:
-            # corruption_score or search_score (if present)
             col = by if by in tmp.columns else "corruption_score"
             tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
         tmp = tmp.drop(columns=["_dt"])
@@ -1216,16 +1291,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             q_emb = q_vec_full
         if isinstance(index_obj, NearestNeighbors):
-            distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df)))
             inds = indices[0]
             sims = 1.0 - distances[0]
-            results = df.iloc[inds].copy()
             results["search_score"] = sims
         elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
-            D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
             inds = I[0]
             sims = D[0]
-            results = df.iloc[inds].copy()
             results["search_score"] = sims
         else:
             return pd.DataFrame(), q_terms
@@ -1238,7 +1313,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if sort_by == "search_score":
             results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
         else:
-            # use blended but keep sort_by if chosen
             if sort_by in results.columns:
                 results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
             else:

 # Common personal mail domains (used with user-specified trusted org domains)
 PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
# Newsletter/newswire heuristics
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
    """Return True when a message looks like a newsletter or newswire item.

    A message is news-like when any of the following holds:
      * the body mentions "unsubscribe" or "manage preferences";
      * the subject contains a typical digest phrase;
      * the sender domain is one of NEWS_DOMAINS or a subdomain of one.

    Args:
        subject: message subject; None or non-string values count as empty.
        body: message body text; None or non-string values count as empty.
        from_domain: sender domain (a full address is accepted, only the
            part after the last "@" is used); None or non-string values
            count as empty.

    Returns:
        True if the message matches any heuristic, otherwise False.
    """
    def _lower(value) -> str:
        # Rows coming from pandas may hold NaN (a truthy float) instead of
        # a string; treat anything that is not a str as missing.
        return value.lower() if isinstance(value, str) else ""

    s = _lower(subject)
    b = _lower(body)
    # Keep only the host part and drop decoration such as "<...>" or a
    # trailing dot so the comparison below is done on a bare domain.
    fd = _lower(from_domain).rsplit("@", 1)[-1].strip(" \t<>.")

    if "unsubscribe" in b or "manage preferences" in b:
        return True
    if any(k in s for k in ("daily briefing", "morning update", "newsletter", "top stories")):
        return True
    # Match on domain boundaries. A plain substring test would treat
    # "microsoft.com" as news because it contains "ft.com".
    return any(fd == d or fd.endswith("." + d) for d in NEWS_DOMAINS)
 # Optional seeded themes for semi-supervised init (used only when LSA is ON)
 CORR_LEX = {
     "kickback"      : ["kickback","bribe","under the table","gift","cash"],
     "money_flow"    : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
 }
+# =================== Label cleanup helpers ===================
 EN_STOP = {
     "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
     "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
     if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
     if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
     if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
+    lang_tok = f'__LANG_{(row.get("lang") or "unk").lower()}__'
+    tokens.append(lang_tok)
     return (t + " " + " ".join(tokens)).strip()
# =================== Cluster labeling: improved PMI + class-TFIDF ===================
def cluster_labels_pmi_bigram(texts, labels, topn=6):
    """Name each cluster with its most distinctive bigrams and unigrams.

    Bigrams are ranked by log(P(bigram | cluster) / P(bigram)); both
    probabilities are built from per-text presence (a bigram counts once
    per text). At most max(2, topn // 2) bigrams are kept. Remaining slots
    up to ``topn`` are filled with unigrams ranked by their TF-IDF weight
    in a two-document corpus: the cluster's concatenated cleaned text
    versus the concatenated cleaned text of every other cluster.

    Args:
        texts: sequence of strings, aligned with ``labels``.
        labels: sequence of cluster ids, each convertible with ``int()``.
        topn: target number of terms per label.

    Returns:
        dict mapping int cluster id to a comma-separated label string, or
        ``"cluster_<id>"`` when no usable term is found for that cluster.
    """
    import math as _math
    from collections import Counter, defaultdict
    from sklearn.feature_extraction.text import TfidfVectorizer

    HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded","回复","主题","收件人","发件人"}

    def is_junk_token(tok: str) -> bool:
        if _is_junk_term(tok):
            return True
        tl = tok.lower()
        if "@" in tl or tl in HEADER_STOP:
            return True
        # Punctuation blobs are junk, unless the token carries an apostrophe.
        if re.search(r"[^\w\-']", tl) and "’" not in tl and "'" not in tl:
            return True
        return False

    def tokenize_clean(t):
        return [w for w in re.findall(TOKEN_PATTERN, t.lower()) if not is_junk_token(w)]

    glob_bg = Counter()
    per_c_bg = defaultdict(Counter)
    per_c_texts = defaultdict(list)
    for txt, c in zip(texts, labels):
        cid = int(c)
        toks = tokenize_clean(txt)
        # Tokens are already junk-free, so adjacent pairs need no re-check.
        bgs = {" ".join(pair) for pair in zip(toks, toks[1:])}
        glob_bg.update(bgs)
        per_c_bg[cid].update(bgs)
        per_c_texts[cid].append(" ".join(toks))

    # Join each cluster's text once; the per-cluster background below is
    # then assembled from these strings instead of re-flattening every list.
    cluster_docs = {cid: " ".join(parts) for cid, parts in per_c_texts.items()}

    n_bi = max(2, topn // 2)
    total_bg = sum(glob_bg.values()) + 1e-12
    labels_out = {}
    for c in sorted({int(x) for x in labels}):
        # PMI bigrams
        scores = []
        total_c = sum(per_c_bg[c].values()) + 1e-12
        for bg, cnt in per_c_bg[c].most_common(2000):
            p_bg_c = cnt / total_c
            p_bg = glob_bg[bg] / total_bg
            if p_bg > 0 and p_bg_c > 0:
                scores.append((_math.log(p_bg_c) - _math.log(p_bg), bg))
        scores.sort(reverse=True)
        top_bi = [bg for _, bg in scores[:n_bi]]

        # class-TFIDF unigrams (cluster doc vs. background doc)
        n_uni = max(0, topn - len(top_bi))
        top_uni = []
        if n_uni:
            doc_c = cluster_docs.get(c, "") or " "
            doc_bg = " ".join(doc for k, doc in cluster_docs.items() if k != c) or " "
            vec = TfidfVectorizer(
                analyzer="word", ngram_range=(1, 1),
                max_features=3000, token_pattern=TOKEN_PATTERN, lowercase=True
            )
            try:
                X = vec.fit_transform([doc_c, doc_bg])
            except ValueError:
                # Raised on an empty vocabulary (every token was filtered
                # out); fall back to bigrams / the default name.
                X = None
            if X is not None:
                vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
                row = X[0].toarray().ravel()
                for i in row.argsort()[::-1]:
                    if row[i] <= 0:
                        # Zero weight means the term only occurs in the
                        # background document, so it must not label c.
                        break
                    term = str(vocab[i])
                    if is_junk_token(term):
                        continue
                    top_uni.append(term)
                    if len(top_uni) >= n_uni:
                        break

        parts = top_bi + top_uni
        labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
    return labels_out
 # =================== Auto-k & merge ===================
     with gr.Accordion("Vectorization & Clustering", open=True):
         with gr.Row():
             max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
+            min_df = gr.Number(label="min_df (doc freq ≥)", value=3, precision=0)  # tightened default
             max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
             use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
             skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
         with gr.Row():
             use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
+            lsa_dim = gr.Number(label="LSA components", value=256, precision=0)  # richer default
             auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
             k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
             mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
         # trusted org domains
         trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
         extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
+        # extend phrases at runtime
         extra_terms_lower = [t.lower() for t in extra_terms]
         recs = _load_json_records(inbox_file.name)
             flags.append(f)
         df["flags"] = flags
+        # Identify news-like messages and separate them out before clustering
+        df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject",""), r.get("body_text",""), r.get("from_domain","")), axis=1)
+        df_main = df[~df["is_news"]].reset_index(drop=True)
+        df_news = df[df["is_news"]].reset_index(drop=True)
+        # Enriched texts (adds __HAS_*__ flags + __LANG__)
+        texts = list(df_main.apply(enrich_text, axis=1))
         # === Vectorization ===
         ngram_range = (1, 2) if use_bigrams else (1, 1)
         )
         X_char = char_vec.fit_transform(texts)
+        # Down-weight char-grams so they don't dominate geometry
+        X_full = hstack([X_word, X_char * 0.4], format="csr")
         d_word = X_word.shape[1]
         d_char = X_char.shape[1]
         d_full = X_full.shape[1]
             gc.collect()
         # Optional anomaly detection (on LSA space)
+        anomaly_scores = np.full((len(df_main),), np.nan, dtype=np.float32)
         if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
             try:
                 iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
                 iso.fit(X_reduced)
+                anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)  # higher = more anomalous
             except Exception:
                 pass
         # K selection
         if bool(auto_k):
         )
         labels = kmeans.fit_predict(X_space)
+        # Merge very-similar clusters (LSA only) — slightly stricter
         if use_lsa:
+            labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.94)
+        # Attach clustering back to df_main
+        df_main["cluster_id"] = labels
         term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
+        df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
+        df_main["anomaly_score"] = anomaly_scores
+        # Newsletter/newswire rows: assign a special cluster
+        if len(df_news):
+            df_news["cluster_id"] = -1
+            df_news["cluster_name"] = "newsletter/news"
+            df_news["anomaly_score"] = np.nan
+        # Combine back
+        df = pd.concat([df_main, df_news], ignore_index=True)
+        # CorruptionScore (uses trusted domains)
         df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
+        # Build search index on clustered subset only
         use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
         index_obj = None
         if use_faiss:
         # Summaries
         cluster_counts = (
+            df[df["cluster_id"] != -1]
+              .groupby(["cluster_id", "cluster_name"]).size()
               .reset_index(name="count")
               .sort_values("count", ascending=False)
               .head(500)
         cluster_counts["label"] = cluster_counts.apply(
             lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
+        # Optionally append newsletter bucket
+        news_count = int((df["cluster_id"] == -1).sum())
+        if news_count > 0:
+            cluster_counts = pd.concat([
+                cluster_counts,
+                pd.DataFrame([{"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count,
+                               "label": f'-1 — newsletter/news ({news_count})'}])
+            ], ignore_index=True)
         cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
         domain_counts = (
         status_md = (
             f"**Processed {len(df):,} emails**  \n"
+            f"Word feats (BM25): {d_word:,}  |  Char feats: {d_char:,} (x0.4)  |  Total: {d_full:,}  \n"
             f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims  |  ' if use_lsa else ''}"
             f"k = {k}  |  Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}  |  "
             f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
         elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
             tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
         else:
             col = by if by in tmp.columns else "corruption_score"
             tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
         tmp = tmp.drop(columns=["_dt"])
             q_emb = q_vec_full
         if isinstance(index_obj, NearestNeighbors):
+            distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df[df["cluster_id"]!=-1])))
             inds = indices[0]
             sims = 1.0 - distances[0]
+            results = df[df["cluster_id"]!=-1].iloc[inds].copy()
             results["search_score"] = sims
         elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
+            D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df[df["cluster_id"]!=-1])))
             inds = I[0]
             sims = D[0]
+            results = df[df["cluster_id"]!=-1].iloc[inds].copy()
             results["search_score"] = sims
         else:
             return pd.DataFrame(), q_terms
         if sort_by == "search_score":
             results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
         else:
             if sort_by in results.columns:
                 results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
             else: