wuhp committed (verified)
Commit 186377a · Parent: abfac2f

Update app.py

Files changed (1): app.py (+212 −206)

app.py CHANGED
@@ -64,6 +64,74 @@ try:
 except Exception:
     VADER_OK = False
 
+# ======== STAGE-1 TAXONOMY (Buckets) ========
+TAXONOMY = {
+    "Lobbyist": ["lobby","lobbyist","pac","influence"],
+    "Campaign Finance": ["donation","contribution","fundraiser","pledge","campaign finance","pac"],
+    "Procurement": ["contract","tender","rfp","rfq","bid","invoice","vendor","purchase order","po"],
+    "HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
+    "Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
+    "Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
+    "Legal": ["legal","lawsuit","attorney","counsel","privileged","court","subpoena","confidential"],
+    "IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
+    "Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
+    "Other": [],
+}
+# header/domain cues (expand as you learn)
+LOBBY_DOMAINS = set()  # e.g., {"acme-lobby.com"}
+LEGAL_DOMAINS = set()  # e.g., {"biglaw.com","firmlaw.com"}
+
+def _contains_any(text: str, terms: list[str]) -> bool:
+    if not text or not terms:
+        return False
+    tl = text.lower()
+    return any(t for t in terms if t and t.lower() in tl)
+
+def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
+    fd = (row.get("from_domain") or "").lower()
+    subj = (row.get("subject") or "")
+    if bucket == "Newsletters/Alerts":
+        return 5.0 if is_news_like(subj, row.get("body_text",""), fd) else 0.0
+    if bucket == "IT/Security":
+        return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
+    if bucket == "Constituent":
+        # personal mail to public office is a strong hint
+        return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
+    if bucket == "Lobbyist":
+        return 5.0 if fd in LOBBY_DOMAINS else 0.0
+    if bucket == "Legal":
+        return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
+    if bucket == "Scheduling":
+        # ICS invite or explicit invite subject
+        body = (row.get("body_text") or "")
+        return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
+    return 0.0
+
+MIN_ROUTE_SCORE = 1.5  # at least ~2 weak signals or one strong
+TIE_MARGIN = 1.0
+
+def route_email_row(row: pd.Series) -> str:
+    text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
+    scores: dict[str,float] = {b: 0.0 for b in TAXONOMY.keys()}
+    # lexicon points
+    for b, terms in TAXONOMY.items():
+        if not terms:
+            continue
+        # count unique term hits to avoid over-crediting repeats
+        hits = sum(1 for t in terms if t and t.lower() in text)
+        scores[b] += float(hits)
+        # strong phrases in your corruption lexicon can hint Lobbyist/Procurement
+        if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
+            scores[b] += 1.0
+    # header bonuses
+    for b in TAXONOMY.keys():
+        scores[b] += _bucket_header_bonus(row, b)
+    # choose
+    best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
+    second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
+    if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
+        return "Other"
+    return best_bucket
+
 # =================== Regex & Flags ===================
 TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
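Note: to make the route-or-abstain rule above concrete, here is a minimal standalone sketch with a toy taxonomy (names and data are hypothetical; the app's full version also adds header/domain bonuses and the SUSPECT_PHRASES hint):

    from typing import Dict, List

    TOY_TAXONOMY: Dict[str, List[str]] = {
        "Procurement": ["contract", "tender", "invoice"],
        "Scheduling": ["meeting", "calendar", "invite"],
        "Other": [],
    }
    MIN_ROUTE_SCORE = 1.5  # ~two weak signals or one strong
    TIE_MARGIN = 1.0

    def route(subject: str, body: str) -> str:
        text = f"{subject} {body}".lower()
        # one point per unique term hit, so repeated words are not over-credited
        scores = {b: float(sum(1 for t in terms if t in text)) for b, terms in TOY_TAXONOMY.items()}
        best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
        second = sorted(scores.values(), reverse=True)[1]
        if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
            return "Other"  # abstain on weak or ambiguous evidence
        return best_bucket

    print(route("RFP: catering contract", "please send your invoice and tender documents"))  # Procurement
    print(route("hello", "quick question"))  # Other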
@@ -1082,6 +1150,50 @@ def corruption_score(row, trusted_domains: set):
         score += 0.3
     return score
 
+def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
+    if df_in.empty:
+        df_in["context_anomaly_score"] = 0.0
+        return df_in
+
+    # 1) IsolationForest percentile -> 0–6. anomaly_score is already computed per partition,
+    # and higher = more anomalous in the current pipeline (score_samples is negated upstream).
+    # Convert it to a percentile per bucket.
+    df = df_in.copy()
+    if "anomaly_score" in df.columns:
+        df["_if_pct"] = 0.0
+        for bkt, grp in df.groupby("bucket", dropna=False):
+            vals = grp["anomaly_score"].astype(float)
+            if vals.notna().sum() >= 5:
+                ranks = vals.rank(pct=True)  # top anomaly gets 1.0
+                df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
+        df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
+    else:
+        df["_if_pts"] = 0.0
+
+    # 2) Rule violations per bucket (0–2)
+    df["_rule_pts"] = 0.0
+    low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
+    for bkt, terms in TAXONOMY.items():
+        mask = (df["bucket"] == bkt)
+        if not mask.any():
+            continue
+        if terms:
+            has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
+            df.loc[mask & (~has_term), "_rule_pts"] += 1.0
+        # header expectation examples:
+        if bkt == "Constituent":
+            df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
+        if bkt == "Scheduling":
+            subj = df["subject"].fillna("").str.lower()
+            df.loc[mask & (~subj.str.contains(r"\b(?:meeting|invite|schedule|calendar)\b", regex=True)), "_rule_pts"] += 1.0
+
+    df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
+
+    # 3) Corruption heuristics capped to 0–3
+    df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
+
+    df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
+    return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
+
 # =================== Gradio UI ===================
 CSS = """
 :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
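Note: for intuition, a compact sketch of how the three components combine into the 0–10 context score (toy data; the column names stand in for the intermediate _if_pts/_rule_pts/_corr_pts columns, and the real function only ranks buckets with at least 5 scores):

    import pandas as pd

    df = pd.DataFrame({
        "bucket": ["Procurement", "Procurement", "Scheduling"],
        "anomaly_score": [0.9, 0.1, 0.5],     # higher = more anomalous
        "rule_pts": [1.0, 0.0, 2.0],          # unmet per-bucket expectations
        "corruption_score": [4.0, 0.5, 1.0],  # capped to 3 below
    })
    if_pts = df.groupby("bucket")["anomaly_score"].rank(pct=True).mul(6.0).clip(0, 6)
    composite = (if_pts + df["rule_pts"].clip(0, 2) + df["corruption_score"].clip(0, 3)).clip(0, 10)
    print(composite.tolist())  # [10.0, 3.5, 9.0]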
@@ -1145,6 +1257,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="")
             embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False)
         with gr.Row():
+            bucket_drop = gr.Dropdown(label="Bucket", choices=["(any)"] + list(TAXONOMY.keys()), value="(any)", allow_custom_value=False)
             cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
             domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
             sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
@@ -1154,7 +1267,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         with gr.Row():
             date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
             date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
-            sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
+            sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
             sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
             # NEW: hide noise toggle
             hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
@@ -1227,6 +1340,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
 
     def _apply_filters(
         df: pd.DataFrame,
+        bucket: Optional[str],
         cluster: Optional[str],
         domain: Optional[str],
         sender: Optional[str],
@@ -1235,11 +1349,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         tag_value: str,
         start: str,
         end: str,
-        hide_noise_flag: bool = False,  # NEW
+        hide_noise_flag: bool = False,
     ) -> pd.DataFrame:
         out = df
+        if bucket and bucket != "(any)":
+            out = out[out["bucket"] == bucket]
         if cluster and cluster != "(any)":
-            m = re.match(r"^(\-?\d+)\s+—", cluster)
+            # Modified to parse the new cluster label format
+            m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
             if m:
                 cid = int(m.group(1))
                 out = out[out["cluster_id"] == cid]
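Note: a quick check that the loosened pattern still recovers the cluster id now that labels carry a bucket prefix (example label hypothetical):

    import re

    label = "Procurement — 7 — vendor invoices (42)"  # "bucket — id — name (count)"
    m = re.match(r"^.*?(\-?\d+)\s+—", label)
    print(int(m.group(1)))  # 7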
@@ -1298,7 +1415,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if inbox_file is None:
             return ("**Please upload a file.**",
                     None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
-                    None, None, None, None)
+                    None, None, None, None, None)
 
     # === Vectorization & Clustering (UPGRADED) ===
     def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
@@ -1484,7 +1601,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if not recs:
             return ("**No valid records found.**",
                     None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
-                    None, None, None, None)
+                    None, None, None, None, None)
 
         normd = []
         for r in tqdm(recs, desc="Normalize", leave=False):
@@ -1495,12 +1612,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if df.empty:
             return ("**No usable email records.**",
                     None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
-                    None, None, None, None)
+                    None, None, None, None, None)
 
         df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
         df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
         df = compute_sentiment_column(df)
 
+        # >>> NEW: stage-1 routing
+        df["bucket"] = df.apply(route_email_row, axis=1)
+        df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
+        df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
+        df.loc[df["is_news"], "bucket"] = "Newsletters/Alerts"
+        df.loc[df["is_notify"], "bucket"] = "IT/Security"
+
         flags = []
         for _, row in df.iterrows():
             f = []
@@ -1514,12 +1638,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             flags.append(f)
         df["flags"] = flags
 
-        df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
-        df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
-
-        df_main = df[~(df["is_news"] | df["is_notify"])].reset_index(drop=True)
-        df_news = df[df["is_news"]].reset_index(drop=True)
-        df_alerts = df[df["is_notify"]].reset_index(drop=True)
+        df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
+        df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
+        df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
 
         kv = None
         emb_dim = 0
@@ -1527,20 +1648,23 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
 
         parts = []
-        if bool(per_language) and "lang" in df_main.columns:
-            for lang_code, grp in df_main.groupby("lang", dropna=False):
-                parts.append((lang_code, grp.copy()))
+        if bool(per_language):
+            for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
+                for lang_code, grp in g_bucket.groupby("lang", dropna=False):
+                    parts.append(((bkt, lang_code), grp.copy()))
         else:
-            parts = [("all", df_main.copy())]
+            for bkt, grp in df_main.groupby("bucket", dropna=False):
+                parts.append(((bkt, "all"), grp.copy()))
 
         labels_list, cluster_name_list, anomaly_list = [], [], []
+        bucket_indexers = []  # keep index locations to reassign later
         X_reduced_holder = None
         term_names_global = {}
         single_partition = (len(parts) == 1)
         d_word_agg, d_char_agg, k_agg = 0, 0, 0
         svd_obj_local, norm_obj_local = None, None
 
-        for p_lang, df_part in parts:
+        for (bucket_name, _lang), df_part in parts:
             if df_part.empty:
                 continue
             texts, subjects_only = _make_texts(df_part)
@@ -1589,38 +1713,29 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 d_word=d_word,
                 d_char=d_char,
             )
-            # NEW: stabilize per partition
-            labels = stabilize_labels(
-                X_space, labels,
-                min_size=40,
-                merge_thresh=0.96,
-                reassign_thresh=0.35,
-            )
-
+            labels = stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35)
             k_agg += len(set(labels))
 
             term_names = cluster_labels_pmi_bigram(
                 texts=texts, labels=labels, subjects=subjects_only,
                 topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
             )
-            term_names_global.update({int(k): v for k, v in term_names.items()})
 
+            # collect to write back in one go
+            bucket_indexers.append(df_part.index)
             labels_list.append(pd.Series(labels, index=df_part.index))
-            cluster_name_list.append(
-                pd.Series(
-                    [term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels],
-                    index=df_part.index,
-                )
-            )
+            cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
             anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
+            # remember term names for the reader pill
+            term_names_global.update({int(k): v for k, v in term_names.items()})
 
             if single_partition and X_reduced is not None:
                 X_reduced_holder = X_reduced
 
         if labels_list:
-            df_main["cluster_id"] = pd.concat(labels_list).sort_index()
-            df_main["cluster_name"] = pd.concat(cluster_name_list).sort_index()
-            df_main["anomaly_score"] = pd.concat(anomaly_list).sort_index()
+            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
+            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
+            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "anomaly_score"] = pd.concat(anomaly_list).sort_index()
         else:
             df_main["cluster_id"] = -10
             df_main["cluster_name"] = "unclustered"
@@ -1637,6 +1752,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
 
         df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
         df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
+        df = compute_context_anomaly(df)
 
         index_obj = None
         use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
@@ -1655,16 +1771,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 pass
 
         cluster_counts = (
-            df.groupby(["cluster_id", "cluster_name"])
+            df.groupby(["bucket", "cluster_id", "cluster_name"])
             .size()
             .reset_index(name="count")
             .sort_values("count", ascending=False)
             .head(500)
         )
         cluster_counts["label"] = cluster_counts.apply(
-            lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
+            lambda r: f'{r["bucket"]} — {int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
         cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
+        bucket_choices = ["(any)"] + sorted(df["bucket"].dropna().unique().tolist())
+
         domain_counts = (
             df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False).head(100)
         )
@@ -1682,7 +1800,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             .sort_values("corruption_score", ascending=False)
             .head(200)
         )
-        out_table = _sort_results(df, "corruption_score", "desc")
+        out_table = _sort_results(df, "context_anomaly_score", "desc")
 
         vec_state = {
             "use_hashing": bool(use_hashing),
@@ -1698,85 +1816,35 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         norm_obj_out = norm_obj_local if single_partition else None
 
         return (
-            status_md,
-            cluster_counts,
-            domain_counts,
-            sender_counts,
-            actors,
-            offhours_table,
-            out_table,
-            df,
-            vec_state,
-            X_reduced_holder,
-            index_obj,
-            term_names_global,
-            bool(use_lsa),
-            use_faiss_flag,
+            status_md, cluster_counts, domain_counts, sender_counts, actors, offhours_table,
+            out_table, df, vec_state, X_reduced_holder, index_obj, term_names_global,
+            bool(use_lsa), use_faiss_flag,
             gr.update(choices=cluster_choices, value="(any)"),
             gr.update(choices=domain_choices, value="(any)"),
             gr.update(choices=sender_choices, value="(any)"),
             gr.update(choices=lang_choices, value="(any)"),
-            svd_obj_out,
-            norm_obj_out,
-            (d_word_agg, d_char_agg),
-            extra_terms_lower,
-            bool(highlight_toggle),
+            svd_obj_out, norm_obj_out, (d_word_agg, d_char_agg),
+            extra_terms_lower, bool(highlight_toggle),
+            gr.update(choices=bucket_choices, value="(any)")
         )
 
     (run_btn.click)(
         process_file,
         inputs=[
-            inbox_file,
-            max_features,
-            min_df,
-            max_df,
-            use_bigrams,
-            skip_lang,
-            use_lsa,
-            lsa_dim,
-            auto_k,
-            k_clusters,
-            mb_batch,
-            use_faiss,
-            use_iso,
-            trusted_domains_in,
-            extra_keywords_in,
-            highlight_toggle,
-            use_hashing,
-            hash_bits,
-            use_hdbscan,
-            hdb_min_cluster,
-            hdb_min_samples,
-            per_language,
-            use_embeddings,
-            embed_weight,
-            embeddings_path,
-            embeddings_binary,
+            inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
+            use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
+            trusted_domains_in, extra_keywords_in, highlight_toggle,
+            use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
+            per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
         ],
         outputs=[
-            status,
-            cluster_counts_df,
-            domain_counts_df,
-            sender_counts_df,
-            actors_df,
-            offhours_df,
-            results_df,
-            state_df,
-            state_vec,
-            state_X_reduced,
-            state_index,
-            state_term_names,
-            state_use_lsa,
-            state_use_faiss,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            state_svd,
-            state_norm,
-            state_dims,
-            state_extra_terms,
-            state_highlight,
+            status, cluster_counts_df, domain_counts_df, sender_counts_df,
+            actors_df, offhours_df, results_df,
+            state_df, state_vec, state_X_reduced, state_index, state_term_names,
+            state_use_lsa, state_use_faiss,
+            cluster_drop, domain_drop, sender_drop, lang_drop,
+            state_svd, state_norm, state_dims, state_extra_terms, state_highlight,
+            bucket_drop,
         ],
     )
 
@@ -1789,85 +1857,56 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
         else:
             tmp["_dt"] = pd.NaT
-        by = by if by in tmp.columns else "corruption_score"
+        by = by if by in tmp.columns else "context_anomaly_score"
         asc = (direction == "asc")
         sort_cols = [by]
         if by == "date":
             sort_cols = ["_dt"]
-        elif by in ["anomaly_score", "corruption_score"]:
+        elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
             sort_cols.append("_dt")
 
         tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
 
         cols_out = [
-            "date",
-            "from_email",
-            "from_domain",
-            "subject",
-            "cluster_name",
-            "lang",
-            "tags",
-            "flags",
-            "sentiment",
-            "corruption_score",
-            "anomaly_score",
+            "date","bucket","from_email","from_domain","subject","cluster_name","lang",
+            "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
         ]
         if "search_score" in tmp.columns:
             cols_out.append("search_score")
 
         return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
 
-    def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
+    def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
         if df is None or len(df) == 0:
             return pd.DataFrame()
         filt = _apply_filters(
-            df, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
+            df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
         )
         return _sort_results(filt, sort_by, sort_dir)
 
     # Re-run when any filter control changes (including hide_noise)
-    for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
+    for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
                  date_start, date_end, sort_by, sort_dir, hide_noise]:
         ctrl.change(
             refresh_results,
             inputs=[
-                state_df,
-                cluster_drop,
-                domain_drop,
-                sender_drop,
-                lang_drop,
-                sentiment_drop,
-                tag_drop,
-                date_start,
-                date_end,
-                sort_by,
-                sort_dir,
-                hide_noise,
+                state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+                sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
             ],
             outputs=[results_df],
         )
 
-    # Reset filters (sets selects to (any), dates blank, sort default, and hide_noise=True)
+    # Reset filters
     reset_btn.click(
-        lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"] + [True],
+        lambda: ["(any)"] * 7 + [""] * 2 + ["context_anomaly_score", "desc"] + [True],
         [],
-        [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
+        [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
          date_start, date_end, sort_by, sort_dir, hide_noise],
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise,
        ],
        outputs=[results_df],
    )
@@ -1964,17 +2003,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     search_btn.click(
         search_fn,
         inputs=[
-            search_query,
-            state_df,
-            state_vec,
-            state_X_reduced,
-            state_index,
-            state_use_lsa,
-            state_use_faiss,
-            state_svd,
-            state_norm,
-            sort_by,
-            sort_dir,
+            search_query, state_df, state_vec, state_X_reduced, state_index,
+            state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir,
         ],
         outputs=[results_df, state_query_terms],
     )
@@ -2003,7 +2033,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         return build_highlighted_html(
             row,
             query_terms=q_terms,
-            cluster_label=clabel,
+            cluster_label=f'{row.get("bucket","Other")} / {clabel}',
             do_highlight=do_highlight,
             extra_terms=extra_terms,
         )
@@ -2020,25 +2050,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return gr.update()
         val = df_sum.iloc[evt.index[0]][col_name]
         return gr.update(value=val)
+
+    def on_cluster_summary_select(evt: gr.SelectData, df_sum: pd.DataFrame):
+        if evt.index is None or df_sum is None or df_sum.empty:
+            return gr.update(), gr.update()
+        r = df_sum.iloc[evt.index[0]]
+        return gr.update(value=r["bucket"]), gr.update(value=r["label"])
 
-    # Cluster summary → set cluster filter
+    # Cluster summary → set bucket & cluster filter
     cluster_counts_df.select(
-        lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
+        on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
         ],
         outputs=[results_df],
     )
@@ -2049,18 +2075,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
         ],
         outputs=[results_df],
     )
@@ -2071,18 +2087,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
         ],
         outputs=[results_df],
     )
 