wuhp committed on
Commit
08fef30
·
verified ·
1 Parent(s): 186377a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +431 -123
app.py CHANGED
@@ -81,7 +81,7 @@ TAXONOMY = {
81
  LOBBY_DOMAINS = set() # e.g., {"acme-lobby.com"}
82
  LEGAL_DOMAINS = set() # e.g., {"biglaw.com","firmlaw.com"}
83
 
84
- def _contains_any(text: str, terms: list[str]) -> bool:
85
  if not text or not terms: return False
86
  tl = text.lower()
87
  return any(t for t in terms if t and t.lower() in tl)
@@ -111,7 +111,7 @@ TIE_MARGIN = 1.0
111
 
112
  def route_email_row(row: pd.Series) -> str:
113
  text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
114
- scores: dict[str,float] = {b: 0.0 for b in TAXONOMY.keys()}
115
  # lexicon points
116
  for b, terms in TAXONOMY.items():
117
  if not terms:
@@ -1194,7 +1194,245 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
1194
  df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
1195
  return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
1196
 
1197
- # =================== Gradio UI ===================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1198
  CSS = """
1199
  :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
1200
  .email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
@@ -1213,8 +1451,13 @@ hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
1213
  .cursor { cursor:pointer; }
1214
  """
1215
 
1216
- with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
 
1217
  gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
 
 
 
 
1218
 
1219
  with gr.Row():
1220
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
@@ -1233,7 +1476,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1233
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
1234
  lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
1235
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
1236
- k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
1237
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
1238
  with gr.Row():
1239
  use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
@@ -1263,14 +1506,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1263
  sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
1264
  lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
1265
  sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
1266
- tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "odd-hours", "personal-mail"], value="(any)")
1267
  with gr.Row():
1268
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
1269
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
1270
  sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
1271
  sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
1272
- # NEW: hide noise toggle
1273
- hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
 
 
 
 
 
1274
 
1275
  with gr.Row():
1276
  run_btn = gr.Button("Process", variant="primary")
@@ -1287,6 +1535,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1287
  actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
1288
  offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
1289
 
 
 
 
 
 
1290
  gr.Markdown("### Search")
1291
  with gr.Row():
1292
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
@@ -1294,7 +1547,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1294
  results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
1295
  email_view = gr.HTML(label="Reader")
1296
 
1297
- # State
1298
  state_df = gr.State()
1299
  state_vec = gr.State()
1300
  state_X_reduced = gr.State()
@@ -1355,7 +1608,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1355
  if bucket and bucket != "(any)":
1356
  out = out[out["bucket"] == bucket]
1357
  if cluster and cluster != "(any)":
1358
- # Modified to parse the new cluster label format
1359
  m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
1360
  if m:
1361
  cid = int(m.group(1))
@@ -1387,7 +1639,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1387
  out = out[out["cluster_id"] != -3]
1388
  return out
1389
 
1390
- # -------- Simple social network stats --------
1391
  def social_stats(df: pd.DataFrame) -> pd.DataFrame:
1392
  deg = {}
1393
  def add_edge(a,b):
@@ -1405,19 +1657,104 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1405
  out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
1406
  return out
1407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1408
  # -------- Main pipeline --------
1409
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1410
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1411
  trusted_domains_in, extra_keywords_in, highlight_toggle,
1412
- # NEW:
1413
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1414
- per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary):
 
1415
  if inbox_file is None:
1416
- return ("**Please upload a file.**",
1417
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1418
- None, None, None, None, None)
 
 
 
 
 
 
 
 
 
 
1419
 
1420
- # === Vectorization & Clustering (UPGRADED) ===
1421
  def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
1422
  texts = list(df_in.apply(enrich_text, axis=1))
1423
  subjects_only = list(df_in["subject"].fillna(""))
@@ -1516,6 +1853,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1516
 
1517
  def _cluster_space(
1518
  X_space,
 
1519
  df_part: pd.DataFrame,
1520
  use_lsa: bool,
1521
  use_hdbscan: bool,
@@ -1531,11 +1869,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1531
  ):
1532
  n = X_space.shape[0]
1533
 
 
 
 
1534
  if n <= 1:
1535
  labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
1536
  centers = None
1537
  chosen_k = int(n) if n > 0 else 0
1538
- return labels, centers, chosen_k
 
1539
  if n < 10:
1540
  k_small = min(max(2, n // 2), n)
1541
  kmeans = MiniBatchKMeans(
@@ -1546,6 +1888,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1546
  )
1547
  labels = kmeans.fit_predict(X_space)
1548
  centers = getattr(kmeans, "cluster_centers_", None)
 
1549
  return labels, centers, int(len(set(labels)))
1550
 
1551
  if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
@@ -1559,9 +1902,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1559
  )
1560
  labels = clusterer.fit_predict(X_space)
1561
  centers = None
 
1562
  chosen_k = int(len(set([l for l in labels if l >= 0])))
1563
  return labels, centers, chosen_k
1564
 
 
1565
  if bool(auto_k):
1566
  if use_lsa and isinstance(X_space, np.ndarray):
1567
  k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
@@ -1569,6 +1914,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1569
  k = auto_k_rule(X_space.shape[0])
1570
  else:
1571
  k = max(10, int(k_clusters or 350))
 
1572
 
1573
  init = None
1574
  if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
@@ -1590,9 +1936,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1590
  centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
1591
  if use_lsa and centers is not None:
1592
  labels = merge_close_clusters(labels, centers, thresh=0.95)
 
1593
  chosen_k = int(len(set(labels)))
1594
  return labels, centers, chosen_k
1595
 
 
1596
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
1597
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1598
  extra_terms_lower = [t.lower() for t in extra_terms]
@@ -1600,8 +1948,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1600
  recs = _load_json_records(inbox_file.name)
1601
  if not recs:
1602
  return ("**No valid records found.**",
1603
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1604
- None, None, None, None, None)
 
 
 
 
 
 
 
1605
 
1606
  normd = []
1607
  for r in tqdm(recs, desc="Normalize", leave=False):
@@ -1611,20 +1966,28 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1611
  df = pd.DataFrame(normd)
1612
  if df.empty:
1613
  return ("**No usable email records.**",
1614
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1615
- None, None, None, None, None)
 
 
 
 
 
 
 
1616
 
1617
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
1618
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
1619
  df = compute_sentiment_column(df)
1620
 
1621
- # >>> NEW: stage-1 routing
1622
  df["bucket"] = df.apply(route_email_row, axis=1)
1623
  df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
1624
  df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
1625
  df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
1626
  df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
1627
 
 
1628
  flags = []
1629
  for _, row in df.iterrows():
1630
  f = []
@@ -1638,15 +2001,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1638
  flags.append(f)
1639
  df["flags"] = flags
1640
 
 
1641
  df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
1642
  df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
1643
  df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
1644
 
 
1645
  kv = None
1646
  emb_dim = 0
1647
  if bool(use_embeddings):
1648
  kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
1649
 
 
1650
  parts = []
1651
  if bool(per_language):
1652
  for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
@@ -1657,11 +2023,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1657
  parts.append(((bkt, "all"), grp.copy()))
1658
 
1659
  labels_list, cluster_name_list, anomaly_list = [], [], []
1660
- bucket_indexers = [] # keep index locations to reassign later
1661
  X_reduced_holder = None
1662
  term_names_global = {}
1663
  single_partition = (len(parts) == 1)
1664
- d_word_agg, d_char_agg, k_agg = 0, 0, 0
1665
  svd_obj_local, norm_obj_local = None, None
1666
 
1667
  for (bucket_name, _lang), df_part in parts:
@@ -1697,8 +2063,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1697
  except Exception:
1698
  pass
1699
 
1700
- labels, _, chosen_k = _cluster_space(
1701
  X_space=X_space,
 
1702
  df_part=df_part,
1703
  use_lsa=bool(use_lsa) and X_reduced is not None,
1704
  use_hdbscan=bool(use_hdbscan),
@@ -1713,25 +2080,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1713
  d_word=d_word,
1714
  d_char=d_char,
1715
  )
1716
- labels = stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35)
1717
- k_agg += len(set(labels))
1718
 
1719
  term_names = cluster_labels_pmi_bigram(
1720
  texts=texts, labels=labels, subjects=subjects_only,
1721
  topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
1722
  )
1723
 
1724
- # collect to write back in one go
1725
  bucket_indexers.append(df_part.index)
1726
  labels_list.append(pd.Series(labels, index=df_part.index))
1727
  cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
1728
  anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
1729
- # remember term names for the reader pill
1730
  term_names_global.update({int(k): v for k, v in term_names.items()})
1731
 
1732
  if single_partition and X_reduced is not None:
1733
  X_reduced_holder = X_reduced
1734
-
1735
  if labels_list:
1736
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
1737
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
@@ -1741,6 +2104,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1741
  df_main["cluster_name"] = "unclustered"
1742
  df_main["anomaly_score"] = np.nan
1743
 
 
1744
  if len(df_news):
1745
  df_news.loc[:, "cluster_id"] = -1
1746
  df_news.loc[:, "cluster_name"] = "newsletter/news"
@@ -1750,10 +2114,22 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1750
  df_alerts.loc[:, "cluster_name"] = "system/alerts"
1751
  df_alerts.loc[:, "anomaly_score"] = np.nan
1752
 
 
1753
  df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
 
 
 
 
 
1754
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1755
  df = compute_context_anomaly(df)
1756
 
 
 
 
 
 
 
1757
  index_obj = None
1758
  use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
1759
  if use_faiss_flag:
@@ -1770,6 +2146,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1770
  except Exception:
1771
  pass
1772
 
 
1773
  cluster_counts = (
1774
  df.groupby(["bucket", "cluster_id", "cluster_name"])
1775
  .size()
@@ -1816,9 +2193,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1816
  norm_obj_out = norm_obj_local if single_partition else None
1817
 
1818
  return (
1819
- status_md, cluster_counts, domain_counts, sender_counts, actors, offhours_table,
1820
- out_table, df, vec_state, X_reduced_holder, index_obj, term_names_global,
1821
- bool(use_lsa), use_faiss_flag,
 
 
 
 
 
1822
  gr.update(choices=cluster_choices, value="(any)"),
1823
  gr.update(choices=domain_choices, value="(any)"),
1824
  gr.update(choices=sender_choices, value="(any)"),
@@ -1828,6 +2210,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1828
  gr.update(choices=bucket_choices, value="(any)")
1829
  )
1830
 
 
1831
  (run_btn.click)(
1832
  process_file,
1833
  inputs=[
@@ -1836,46 +2219,24 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1836
  trusted_domains_in, extra_keywords_in, highlight_toggle,
1837
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1838
  per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
 
1839
  ],
1840
  outputs=[
1841
  status, cluster_counts_df, domain_counts_df, sender_counts_df,
1842
- actors_df, offhours_df, results_df,
1843
- state_df, state_vec, state_X_reduced, state_index, state_term_names,
 
 
 
1844
  state_use_lsa, state_use_faiss,
1845
  cluster_drop, domain_drop, sender_drop, lang_drop,
1846
- state_svd, state_norm, state_dims, state_extra_terms, state_highlight,
 
1847
  bucket_drop,
1848
  ],
1849
  )
1850
 
1851
  # -------- Filtering & Search --------
1852
- def _sort_results(df, by, direction):
1853
- if df is None or len(df) == 0:
1854
- return pd.DataFrame()
1855
- tmp = df.copy()
1856
- if "date" in tmp.columns:
1857
- tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
1858
- else:
1859
- tmp["_dt"] = pd.NaT
1860
- by = by if by in tmp.columns else "context_anomaly_score"
1861
- asc = (direction == "asc")
1862
- sort_cols = [by]
1863
- if by == "date":
1864
- sort_cols = ["_dt"]
1865
- elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
1866
- sort_cols.append("_dt")
1867
-
1868
- tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
1869
-
1870
- cols_out = [
1871
- "date","bucket","from_email","from_domain","subject","cluster_name","lang",
1872
- "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
1873
- ]
1874
- if "search_score" in tmp.columns:
1875
- cols_out.append("search_score")
1876
-
1877
- return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
1878
-
1879
  def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
1880
  if df is None or len(df) == 0:
1881
  return pd.DataFrame()
@@ -1884,7 +2245,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1884
  )
1885
  return _sort_results(filt, sort_by, sort_dir)
1886
 
1887
- # Re-run when any filter control changes (including hide_noise)
1888
  for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
1889
  date_start, date_end, sort_by, sort_dir, hide_noise]:
1890
  ctrl.change(
@@ -1911,57 +2272,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1911
  outputs=[results_df],
1912
  )
1913
 
1914
- # -------- Search helpers --------
1915
- def _tokenize_query(q: str) -> List[str]:
1916
- return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
1917
-
1918
- def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
1919
- try:
1920
- return norm.transform(svd.transform(q_vec)).astype(np.float32)
1921
- except Exception:
1922
- return None
1923
-
1924
- def _vectorize_query(q, vec_state, corpus_texts):
1925
- # Build the same features for the query that we used for docs
1926
- char_min_df = 1 if len(corpus_texts) <= 1 else 2
1927
-
1928
- if vec_state.get("use_hashing"):
1929
- hv = HashingVectorizer(
1930
- analyzer="word",
1931
- ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
1932
- n_features=2 ** vec_state.get("hash_bits", 18),
1933
- token_pattern=TOKEN_PATTERN,
1934
- lowercase=True,
1935
- norm=None,
1936
- alternate_sign=False,
1937
- )
1938
- # Fit TF-IDF weights from corpus
1939
- counts = hv.transform(corpus_texts)
1940
- tfidf_tr = TfidfTransformer().fit(counts)
1941
- q_word = tfidf_tr.transform(hv.transform([q]))
1942
- else:
1943
- cv = CountVectorizer(
1944
- analyzer="word",
1945
- ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
1946
- max_features=vec_state.get("max_features"),
1947
- min_df=vec_state.get("min_df"),
1948
- max_df=vec_state.get("max_df"),
1949
- token_pattern=TOKEN_PATTERN,
1950
- lowercase=True,
1951
- stop_words=STOPWORD_FOR_VEC,
1952
- dtype=np.float32,
1953
- )
1954
- tf = cv.fit_transform(corpus_texts)
1955
- bm25 = BM25Transformer().fit(tf)
1956
- q_word = bm25.transform(cv.transform([q]))
1957
-
1958
- char_vec = CharTfidf(
1959
- analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
1960
- ).fit(corpus_texts)
1961
- q_char = char_vec.transform([q])
1962
-
1963
- return hstack([q_word, q_char * 0.20], format="csr")
1964
-
1965
  def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
1966
  if not q or df is None or vec is None or index is None:
1967
  return pd.DataFrame(), []
@@ -1984,7 +2295,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1984
  return pd.DataFrame(), q_terms
1985
 
1986
  if isinstance(index, NearestNeighbors):
1987
- # brute-force cosine on reduced space
1988
  if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
1989
  return pd.DataFrame(), q_terms
1990
  dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
@@ -2009,7 +2319,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2009
  outputs=[results_df, state_query_terms],
2010
  )
2011
 
2012
- # -------- Reader selection (build highlighted HTML) --------
2013
  def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
2014
  if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
2015
  return ""
@@ -2044,7 +2354,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2044
  outputs=[email_view],
2045
  )
2046
 
2047
- # Click-to-filter conveniences for summary tables
2048
  def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
2049
  if evt.index is None or df_sum is None or df_sum.empty:
2050
  return gr.update()
@@ -2057,7 +2367,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2057
  r = df_sum.iloc[evt.index[0]]
2058
  return gr.update(value=r["bucket"]), gr.update(value=r["label"])
2059
 
2060
- # Cluster summary → set bucket & cluster filter
2061
  cluster_counts_df.select(
2062
  on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
2063
  ).then(
@@ -2069,7 +2378,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2069
  outputs=[results_df],
2070
  )
2071
 
2072
- # Domain summary → set domain filter
2073
  domain_counts_df.select(
2074
  lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
2075
  ).then(
@@ -2081,7 +2389,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2081
  outputs=[results_df],
2082
  )
2083
 
2084
- # Sender summary → set sender filter
2085
  sender_counts_df.select(
2086
  lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
2087
  ).then(
@@ -2095,4 +2402,5 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2095
 
2096
  if __name__ == "__main__":
2097
  # Disable SSR to avoid handler arity warnings under server-side rendering
2098
- demo.launch(ssr_mode=False)
 
 
81
  LOBBY_DOMAINS = set() # e.g., {"acme-lobby.com"}
82
  LEGAL_DOMAINS = set() # e.g., {"biglaw.com","firmlaw.com"}
83
 
84
+ def _contains_any(text: str, terms: list) -> bool:
85
  if not text or not terms: return False
86
  tl = text.lower()
87
  return any(t for t in terms if t and t.lower() in tl)
 
111
 
112
  def route_email_row(row: pd.Series) -> str:
113
  text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
114
+ scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
115
  # lexicon points
116
  for b, terms in TAXONOMY.items():
117
  if not terms:
 
1194
  df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
1195
  return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
1196
 
1197
+ # =================== 🔧 NEW: Per-bucket k & stabilizer params ===================
1198
+ def _bucket_k_multiplier(bucket_name: str) -> float:
1199
+ b = (bucket_name or "").lower()
1200
+ if b in ("constituent",): return 1.25
1201
+ if b in ("procurement", "campaign finance", "receipts/billing", "lobbyist"): return 1.15
1202
+ if b in ("scheduling", "other"): return 1.00
1203
+ if b in ("legal",): return 0.80
1204
+ return 1.00
1205
+
1206
+ def _bucket_stabilizer_params(bucket_name: str) -> Tuple[int, float, float]:
1207
+ b = (bucket_name or "").lower()
1208
+ if b == "legal": return (30, 0.97, 0.38)
1209
+ if b == "procurement": return (35, 0.96, 0.36)
1210
+ if b == "campaign finance": return (35, 0.96, 0.36)
1211
+ if b == "constituent": return (40, 0.95, 0.33)
1212
+ if b == "receipts/billing": return (40, 0.95, 0.35)
1213
+ if b == "scheduling": return (35, 0.95, 0.35)
1214
+ return (40, 0.96, 0.35)
1215
+
1216
+ # =================== 🔧 NEW: Label de-dup helpers ===================
1217
def _normalize_label_tokens(label: str) -> set:
    """Reduce a cluster label to a normalized keyword set for similarity tests.

    Lowercases the label, extracts Latin/Hebrew word tokens (2+ chars),
    strips a trailing plural "s" from tokens longer than 3 chars, then
    drops stopwords (STOP_TERMS / EN_STOP / HE_STOP) and anything shorter
    than 2 chars.
    """
    if not label:
        return set()
    lowered = str(label).lower()
    raw_tokens = re.findall(r"[a-z\u0590-\u05FF][a-z\u0590-\u05FF\-']{1,}", lowered)
    singulars = [tok[:-1] if len(tok) > 3 and tok.endswith("s") else tok for tok in raw_tokens]
    return {
        tok
        for tok in singulars
        if tok not in STOP_TERMS and tok not in EN_STOP and tok not in HE_STOP and len(tok) >= 2
    }
1223
+
1224
+ def _jaccard(a: set, b: set) -> float:
1225
+ if not a or not b: return 0.0
1226
+ inter = len(a & b)
1227
+ if inter == 0: return 0.0
1228
+ return inter / float(len(a | b))
1229
+
1230
def dedupe_cluster_labels_in_bucket(df: pd.DataFrame, bucket: str, sim_thresh: float = 0.72) -> pd.DataFrame:
    """Merge near-duplicate cluster display names within one bucket.

    Cluster labels whose normalized token sets have Jaccard similarity >=
    ``sim_thresh`` are grouped via union-find, and every cluster in a group
    is renamed to a single canonical label (the longest name in the group).
    Mutates ``df["cluster_name"]`` in place for the affected rows and also
    returns ``df`` for chaining.
    """
    sel = df[df["bucket"] == bucket].copy()
    if sel.empty or "cluster_name" not in sel.columns:
        return df

    # One (cluster_id, cluster_name) pair per cluster; token sets drive similarity.
    names = sel[["cluster_id", "cluster_name"]].drop_duplicates()
    tokens = {int(cid): _normalize_label_tokens(str(name)) for cid, name in names.values}
    ids = list(tokens.keys())

    # Minimal union-find over cluster ids (no path compression; groups are small).
    parent = {i: i for i in ids}
    def find(i):
        while parent[i] != i: i = parent[i]
        return i
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb: parent[rb] = ra

    # Union every pair of clusters whose label token sets are similar enough.
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            if _jaccard(tokens[ids[i]], tokens[ids[j]]) >= sim_thresh:
                union(ids[i], ids[j])

    names_map = dict(names.values)
    # Group cluster ids by union-find root, keeping each id's original label.
    comp_to_canon = {}
    for cid in ids:
        root = find(cid)
        comp_to_canon.setdefault(root, [])
        comp_to_canon[root].append((cid, names_map.get(cid, "")))

    # Canonical label per group: longest name, ties broken lexicographically.
    canon_for_cluster = {}
    for root, items in comp_to_canon.items():
        best = max(items, key=lambda kv: (len(kv[1] or ""), kv[1]))
        for cid, _ in items:
            canon_for_cluster[cid] = best[1]

    # Rewrite labels only for this bucket's rows; unmatched ids keep their name.
    df.loc[sel.index, "cluster_name"] = sel["cluster_id"].map(lambda c: canon_for_cluster.get(int(c), names_map.get(int(c), "")))
    return df
1267
+
1268
def dedupe_all_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Run cluster-label de-duplication over every bucket present in *df*."""
    result = df
    for bucket in sorted(df["bucket"].dropna().unique()):
        result = dedupe_cluster_labels_in_bucket(result, bucket, sim_thresh=0.72)
    return result
1273
+
1274
+ # =================== 🔎 NEW: Surveillance-campaign detection ===================
1275
# Keyword lexicon: phrases typical of recurring monitoring / briefing emails.
SURV_KEYWORDS = [
    "daily report","daily brief","briefing","sitreps","sitrep","situation report","summary",
    "dossier","monitoring","tracking","watchlist","watch list","profile","surveillance",
    "intel","intelligence","osint","open source intel","clippings","press clips","digest",
    "alert","alerting","dispatch","bulletin","roundup","update"
]
# Case-insensitive alternation over the escaped keywords above.
SURV_RE = re.compile("|".join([re.escape(k) for k in SURV_KEYWORDS]), re.I)

# Subject-normalization patterns: collapse concrete values into placeholders
# so repeated "templated" subjects compare equal.
SUBJ_NUM_RE = re.compile(r"\b\d{1,4}([,./-]\d{1,4})*\b")  # numbers / numeric codes
SUBJ_DATE_RE = re.compile(r"\b(?:\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?|\d{4}-\d{2}-\d{2}|jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b", re.I)  # dates / month names
SUBJ_FW_RE = re.compile(r"^\s*(re:|fw:|fwd:)\s*", re.I)  # reply/forward prefixes
EMAIL_RE = re.compile(r"\b[\w.\-+%]+@[\w.-]+\.[A-Za-z]{2,}\b")  # email addresses
1287
+
1288
def _candidate_entities_from_subjects(df: pd.DataFrame, extra_watchlist: List[str]) -> List[str]:
    """Collect candidate entity names: explicit watchlist entries plus
    capitalized 1-3 word sequences mined from email subjects.

    Filters out English stopwords and names shorter than 5 characters;
    the result is sorted and capped at 3000 entries.
    """
    candidates = {w.strip() for w in (extra_watchlist or []) if w.strip()}
    name_pattern = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b")
    for subject in df["subject"].dropna().astype(str).tolist():
        for match in name_pattern.finditer(subject):
            candidate = match.group(1).strip()
            if candidate.lower() in EN_STOP or len(candidate) < 5:
                continue
            candidates.add(candidate)
    return sorted(candidates)[:3000]
1300
+
1301
def _normalize_subject_template(subj: str, entity: str) -> str:
    """Canonicalize a subject line into a reusable "template" string.

    Strips Re:/Fw: prefixes, then replaces the entity name, dates, numbers,
    and email addresses with placeholder tokens, collapses whitespace, and
    lowercases — so repeated campaign subjects normalize to the same string.
    """
    if not subj:
        return ""
    templ = SUBJ_FW_RE.sub("", subj)
    try:
        templ = re.sub(re.escape(entity), "«ENTITY»", templ, flags=re.I)
    except Exception:
        # Defensive: a pathological entity string must not break normalization.
        pass
    templ = SUBJ_DATE_RE.sub("«DATE»", templ)
    templ = SUBJ_NUM_RE.sub("«NUM»", templ)
    templ = EMAIL_RE.sub("«EMAIL»", templ)
    return re.sub(r"\s+", " ", templ).strip().lower()
1313
+
1314
+ def _entity_mask_present(row: pd.Series, entity: str) -> bool:
1315
+ t = (row.get("subject","") + " " + row.get("body_text","")).lower()
1316
+ e = (entity or "").lower()
1317
+ return (e in t) if e else False
1318
+
1319
def detect_surveillance_campaigns(
    df: pd.DataFrame,
    watchlist: Optional[List[str]] = None,
    min_mentions: int = 15,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Score candidate entities for recurring "surveillance-style" email campaigns.

    For every candidate entity (explicit watchlist + capitalized names mined
    from subjects) mentioned in at least ``min_mentions`` emails, compute a
    heuristic score from: subject-template repetition, surveillance-keyword
    share, weekly cadence spike (z-score of the last week vs. earlier weeks),
    sender/domain spread, internal-mail share, and average recipient count.

    Args:
        df: normalized email frame (needs date/subject/body_text/bucket/
            from_email/from_domain/to_emails columns).
        watchlist: optional extra entity names to always consider.
        min_mentions: minimum mention count for an entity to be scored.

    Returns:
        (entities, samples): per-entity summary (top 200 by score, level in
        {"info","possible","likely"}) and up to 8 example emails per flagged
        entity. Both are empty DataFrames when nothing qualifies.
    """
    if df.empty:
        return pd.DataFrame(), pd.DataFrame()

    watch = [w.strip() for w in (watchlist or []) if w.strip()]
    cands = _candidate_entities_from_subjects(df, watch)

    # Work on a copy with derived helper columns; never mutate the caller's frame.
    dfd = df.copy()
    dfd["_dt"] = pd.to_datetime(dfd["date"], utc=True, errors="coerce")
    dfd["_day"] = dfd["_dt"].dt.date
    dfd["_week"] = dfd["_dt"].dt.to_period("W").astype(str)
    dfd["_is_news"] = (dfd["bucket"] == "Newsletters/Alerts")
    dfd["_is_it"] = (dfd["bucket"] == "IT/Security")
    dfd["_is_internal"] = ~(dfd["_is_news"] | dfd["_is_it"])
    dfd["_recips"] = dfd["to_emails"].apply(lambda xs: len(xs) if isinstance(xs, list) else 0)

    # Perf fix: build the lowercase searchable text ONCE. The previous version
    # ran a per-row apply for every candidate (O(entities * rows) Python calls);
    # a vectorized substring test per entity gives the same mentions mask.
    combined_text = (
        dfd["subject"].fillna("").astype(str) + " " + dfd["body_text"].fillna("").astype(str)
    ).str.lower()

    rows = []
    samples = []

    for entity in cands:
        # Candidates are always non-empty strings, so a plain substring match
        # mirrors the old _entity_mask_present() semantics.
        mask = combined_text.str.contains(entity.lower(), regex=False)
        grp = dfd[mask]
        n = len(grp)
        if n < int(min_mentions):
            continue

        n_senders = grp["from_email"].nunique()
        n_domains = grp["from_domain"].nunique()
        pct_news = float((grp["_is_news"].mean() if n else 0.0))
        pct_int = float((grp["_is_internal"].mean() if n else 0.0))
        avg_recips = float((grp["_recips"].mean() if n else 0.0))

        # Cadence spike: z-score of the latest week against the earlier weeks.
        wk = grp.groupby("_week").size().astype(float)
        if len(wk) >= 4:
            baseline = wk.iloc[:-1]
            mu = float(baseline.mean()) if len(baseline) else 0.0
            sd = float(baseline.std(ddof=1)) if len(baseline) > 1 else 0.0
            last = float(wk.iloc[-1])
            weekly_peak_z = 0.0 if sd == 0.0 else (last - mu) / sd
        else:
            weekly_peak_z = 0.0

        # Template repetition: share of the single most common normalized subject.
        norm_subj = grp["subject"].fillna("").astype(str).map(lambda s: _normalize_subject_template(s, entity))
        if len(norm_subj):
            top_template_share = norm_subj.value_counts(normalize=True).iloc[0]
        else:
            top_template_share = 0.0

        # Share of mentions whose text contains a surveillance keyword.
        kw_share = float(((grp["subject"].fillna("") + " " + grp["body_text"].fillna("")).str.contains(SURV_RE).mean()) if n else 0.0)

        # Weighted heuristic score; each component is capped at its weight.
        score = 0.0
        score += 2.5 * min(1.0, top_template_share)
        score += 2.0 * min(1.0, kw_share)
        score += 1.5 * min(1.0, weekly_peak_z / 3.0)
        score += 0.8 * min(1.0, n_senders / 10.0)
        score += 0.5 * min(1.0, n_domains / 10.0)
        score += 1.0 * pct_int
        score += 0.3 * min(1.0, avg_recips / 10.0)

        level = "info"
        if score >= 6.5:
            level = "likely"
        elif score >= 4.5:
            level = "possible"

        first_d = grp["_dt"].min()
        last_d = grp["_dt"].max()

        rows.append({
            "entity": entity,
            "surveillance_score": round(float(score), 3),
            "level": level,
            "n_emails": int(n),
            "n_senders": int(n_senders),
            "n_domains": int(n_domains),
            "pct_newsletters": round(pct_news, 3),
            "pct_internal": round(pct_int, 3),
            "avg_recipients": round(avg_recips, 2),
            "weekly_peak_z": round(float(weekly_peak_z), 3),
            "template_max_share": round(float(top_template_share), 3),
            "keyword_share": round(float(kw_share), 3),
            "first_date": str(first_d) if pd.notna(first_d) else "",
            "last_date": str(last_d) if pd.notna(last_d) else "",
            "notes": "template/keywords/cadence/senders/domains mix"
        })

        ex = grp[["date","from_email","from_domain","subject","bucket"]].copy().head(8)
        ex.insert(0, "entity", entity)
        samples.append(ex)

    # Bug fix: sort_values on a DataFrame built from an empty `rows` list has no
    # columns and raised KeyError for "surveillance_score"; return empty instead.
    if rows:
        ent_df = pd.DataFrame(rows).sort_values(["surveillance_score","n_emails"], ascending=[False, False]).head(200)
    else:
        ent_df = pd.DataFrame()
    samp_df = pd.concat(samples, ignore_index=True) if samples else pd.DataFrame()

    return ent_df, samp_df
1415
+
1416
def tag_surveillance_emails(df: pd.DataFrame, ent_df: pd.DataFrame, threshold: float = 4.5) -> pd.DataFrame:
    """Add a ``surveillance`` tag to emails that mention any high-scoring entity.

    Parameters
    ----------
    df : emails frame with at least ``subject``, ``body_text`` and ``tags`` columns.
    ent_df : output of ``detect_surveillance_campaigns`` — must carry ``entity``
        and ``surveillance_score`` columns.
    threshold : minimum ``surveillance_score`` for an entity to trigger tagging.

    Returns
    -------
    The original frame untouched when there is nothing to tag, otherwise a copy
    whose ``tags`` lists are rebuilt (de-duplicated and sorted).
    """
    if df.empty or ent_df.empty:
        return df
    hot = ent_df.loc[ent_df["surveillance_score"] >= float(threshold), "entity"].tolist()
    if not hot:
        return df
    # Hoist the per-entity lowercasing out of the per-row apply (loop-invariant).
    hot_lower = [e.lower() for e in hot]

    def _tag(row):
        # Robustness fix: subject/body cells may be NaN (float) in a pandas
        # frame; the previous bare string concat raised TypeError. Coerce to ""
        # for any non-string value, matching the .fillna("") guards used
        # elsewhere in this file.
        subj = row.get("subject", "")
        body = row.get("body_text", "")
        subj = subj if isinstance(subj, str) else ""
        body = body if isinstance(body, str) else ""
        txt = f"{subj} {body}".lower()
        tags = set(row.get("tags") or [])
        if any(e in txt for e in hot_lower):
            tags.add("surveillance")
        return sorted(tags)

    out = df.copy()
    out["tags"] = out.apply(_tag, axis=1)
    return out
1432
+
1433
+ # =================== UI / PIPELINE CONTINUATION ===================
1434
+
1435
+ # ---------- Styles ----------
1436
  CSS = """
1437
  :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
1438
  .email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
 
1451
  .cursor { cursor:pointer; }
1452
  """
1453
 
1454
+ # ---------- App ----------
1455
+ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Surveillance Radar", css=CSS, theme="soft") as demo:
1456
  gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
1457
+ gr.Markdown(
1458
+ "This build includes per-bucket **k** heuristics, label **de-dup**, and a **surveillance-campaign detector** "
1459
+ "(template cadence + keywords + multi-sender/domain signals)."
1460
+ )
1461
 
1462
  with gr.Row():
1463
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
 
1476
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
1477
  lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
1478
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
1479
+ k_clusters = gr.Number(label="Base k (before per-bucket multiplier)", value=350, precision=0)
1480
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
1481
  with gr.Row():
1482
  use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
 
1506
  sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
1507
  lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
1508
  sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
1509
+ tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "surveillance", "odd-hours", "personal-mail"], value="(any)")
1510
  with gr.Row():
1511
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
1512
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
1513
  sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
1514
  sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
1515
+ with gr.Row():
1516
+ hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
1517
+
1518
+ with gr.Accordion("Surveillance Radar", open=True):
1519
+ with gr.Row():
1520
+ watchlist_in = gr.Textbox(label="Watchlist (names or entities, comma-separated)", value="Hillary Clinton, Joe Biden, Donald Trump")
1521
+ min_mentions = gr.Number(label="Min mentions per entity", value=15, precision=0)
1522
 
1523
  with gr.Row():
1524
  run_btn = gr.Button("Process", variant="primary")
 
1535
  actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
1536
  offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
1537
 
1538
+ gr.Markdown("### Surveillance Campaigns (detected entities)")
1539
+ with gr.Row():
1540
+ surv_entities_df = gr.Dataframe(label="Entities ranked by surveillance score", interactive=False, wrap=True)
1541
+ surv_samples_df = gr.Dataframe(label="Sample emails for highlighted entities", interactive=False, wrap=True)
1542
+
1543
  gr.Markdown("### Search")
1544
  with gr.Row():
1545
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
 
1547
  results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
1548
  email_view = gr.HTML(label="Reader")
1549
 
1550
+ # -------- State --------
1551
  state_df = gr.State()
1552
  state_vec = gr.State()
1553
  state_X_reduced = gr.State()
 
1608
  if bucket and bucket != "(any)":
1609
  out = out[out["bucket"] == bucket]
1610
  if cluster and cluster != "(any)":
 
1611
  m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
1612
  if m:
1613
  cid = int(m.group(1))
 
1639
  out = out[out["cluster_id"] != -3]
1640
  return out
1641
 
1642
+ # -------- Social graph summary --------
1643
  def social_stats(df: pd.DataFrame) -> pd.DataFrame:
1644
  deg = {}
1645
  def add_edge(a,b):
 
1657
  out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
1658
  return out
1659
 
1660
+ # -------- Sorting helper --------
1661
+ def _sort_results(df, by, direction):
1662
+ if df is None or len(df) == 0:
1663
+ return pd.DataFrame()
1664
+ tmp = df.copy()
1665
+ if "date" in tmp.columns:
1666
+ tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
1667
+ else:
1668
+ tmp["_dt"] = pd.NaT
1669
+ by = by if by in tmp.columns else "context_anomaly_score"
1670
+ asc = (direction == "asc")
1671
+ sort_cols = [by]
1672
+ if by == "date":
1673
+ sort_cols = ["_dt"]
1674
+ elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
1675
+ sort_cols.append("_dt")
1676
+ tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
1677
+ cols_out = [
1678
+ "date","bucket","from_email","from_domain","subject","cluster_name","lang",
1679
+ "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
1680
+ ]
1681
+ if "search_score" in tmp.columns:
1682
+ cols_out.append("search_score")
1683
+ return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
1684
+
1685
+ # -------- Vectorization helpers (mirror training path for queries) --------
1686
+ def _tokenize_query(q: str) -> List[str]:
1687
+ return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
1688
+
1689
+ def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
1690
+ try:
1691
+ return norm.transform(svd.transform(q_vec)).astype(np.float32)
1692
+ except Exception:
1693
+ return None
1694
+
1695
def _vectorize_query(q, vec_state, corpus_texts):
    """Vectorize a search query in the same feature space as the documents.

    Re-fits the word-level (hashing+TF-IDF or count+BM25) and char-level
    vectorizers on ``corpus_texts`` using the settings recorded in
    ``vec_state``, then transforms the single query ``q`` and concatenates
    both feature blocks (char-grams down-weighted by 0.20) into one sparse
    row vector.

    NOTE(review): this refits the vectorizers over the full corpus on every
    call — presumably acceptable for interactive search; confirm corpus size
    before using on very large mailboxes.
    """
    # Build the same features for the query that we used for docs
    # Char n-grams need min_df=1 when the corpus is a single document,
    # otherwise the fit would drop every feature.
    char_min_df = 1 if len(corpus_texts) <= 1 else 2

    if vec_state.get("use_hashing"):
        # Hashing path: stateless word features + TF-IDF weights fit on corpus.
        hv = HashingVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            n_features=2 ** vec_state.get("hash_bits", 18),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            norm=None,
            alternate_sign=False,
        )
        counts = hv.transform(corpus_texts)
        tfidf_tr = TfidfTransformer().fit(counts)
        q_word = tfidf_tr.transform(hv.transform([q]))
    else:
        # Vocabulary path: counts with the recorded df bounds, BM25-weighted.
        cv = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            max_features=vec_state.get("max_features"),
            min_df=vec_state.get("min_df"),
            max_df=vec_state.get("max_df"),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            stop_words=STOPWORD_FOR_VEC,
            dtype=np.float32,
        )
        tf = cv.fit_transform(corpus_texts)
        bm25 = BM25Transformer().fit(tf)
        q_word = bm25.transform(cv.transform([q]))

    # Character 3–5-grams add robustness to typos/obfuscation.
    char_vec = CharTfidf(
        analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
    ).fit(corpus_texts)
    q_char = char_vec.transform([q])

    # Concatenate word + (down-weighted) char blocks into one CSR row.
    return hstack([q_word, q_char * 0.20], format="csr")
1734
+
1735
  # -------- Main pipeline --------
1736
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1737
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1738
  trusted_domains_in, extra_keywords_in, highlight_toggle,
 
1739
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1740
+ per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
1741
+ watchlist_in, min_mentions):
1742
  if inbox_file is None:
1743
+ return (
1744
+ "**Please upload a file.**",
1745
+ None, None, None, None, None,
1746
+ None, None, # surveillance outputs
1747
+ None, # results_df
1748
+ None, None, None, # states df/vec/X
1749
+ None, None, # index & term names
1750
+ None, None, # flags
1751
+ gr.update(), gr.update(), gr.update(), gr.update(), # dropdowns
1752
+ None, None, None, # svd/norm/dims
1753
+ None, None, # extra terms / highlight
1754
+ gr.update() # bucket list
1755
+ )
1756
 
1757
+ # === Inner helpers for this function ===
1758
  def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
1759
  texts = list(df_in.apply(enrich_text, axis=1))
1760
  subjects_only = list(df_in["subject"].fillna(""))
 
1853
 
1854
  def _cluster_space(
1855
  X_space,
1856
+ bucket_name: str,
1857
  df_part: pd.DataFrame,
1858
  use_lsa: bool,
1859
  use_hdbscan: bool,
 
1869
  ):
1870
  n = X_space.shape[0]
1871
 
1872
+ # Per-bucket stabilizer params
1873
+ min_size, merge_th, reassign_th = _bucket_stabilizer_params(bucket_name)
1874
+
1875
  if n <= 1:
1876
  labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
1877
  centers = None
1878
  chosen_k = int(n) if n > 0 else 0
1879
+ return stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th), centers, chosen_k
1880
+
1881
  if n < 10:
1882
  k_small = min(max(2, n // 2), n)
1883
  kmeans = MiniBatchKMeans(
 
1888
  )
1889
  labels = kmeans.fit_predict(X_space)
1890
  centers = getattr(kmeans, "cluster_centers_", None)
1891
+ labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
1892
  return labels, centers, int(len(set(labels)))
1893
 
1894
  if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
 
1902
  )
1903
  labels = clusterer.fit_predict(X_space)
1904
  centers = None
1905
+ labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
1906
  chosen_k = int(len(set([l for l in labels if l >= 0])))
1907
  return labels, centers, chosen_k
1908
 
1909
+ # Choose k (global rule or kneedle), then per-bucket multiplier
1910
  if bool(auto_k):
1911
  if use_lsa and isinstance(X_space, np.ndarray):
1912
  k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
 
1914
  k = auto_k_rule(X_space.shape[0])
1915
  else:
1916
  k = max(10, int(k_clusters or 350))
1917
+ k = int(max(2, round(k * _bucket_k_multiplier(bucket_name))))
1918
 
1919
  init = None
1920
  if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
 
1936
  centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
1937
  if use_lsa and centers is not None:
1938
  labels = merge_close_clusters(labels, centers, thresh=0.95)
1939
+ labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
1940
  chosen_k = int(len(set(labels)))
1941
  return labels, centers, chosen_k
1942
 
1943
+ # ---- Begin processing ----
1944
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
1945
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1946
  extra_terms_lower = [t.lower() for t in extra_terms]
 
1948
  recs = _load_json_records(inbox_file.name)
1949
  if not recs:
1950
  return ("**No valid records found.**",
1951
+ None, None, None, None, None,
1952
+ None, None,
1953
+ None,
1954
+ None, None, None,
1955
+ None, None,
1956
+ None, None, None, None,
1957
+ None, None, None,
1958
+ None, None,
1959
+ None)
1960
 
1961
  normd = []
1962
  for r in tqdm(recs, desc="Normalize", leave=False):
 
1966
  df = pd.DataFrame(normd)
1967
  if df.empty:
1968
  return ("**No usable email records.**",
1969
+ None, None, None, None, None,
1970
+ None, None,
1971
+ None,
1972
+ None, None, None,
1973
+ None, None,
1974
+ None, None, None, None,
1975
+ None, None, None,
1976
+ None, None,
1977
+ None)
1978
 
1979
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
1980
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
1981
  df = compute_sentiment_column(df)
1982
 
1983
+ # Stage-1 routing (bucketing)
1984
  df["bucket"] = df.apply(route_email_row, axis=1)
1985
  df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
1986
  df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
1987
  df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
1988
  df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
1989
 
1990
+ # Flags
1991
  flags = []
1992
  for _, row in df.iterrows():
1993
  f = []
 
2001
  flags.append(f)
2002
  df["flags"] = flags
2003
 
2004
+ # Split out stable buckets
2005
  df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
2006
  df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
2007
  df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
2008
 
2009
+ # Optional embeddings
2010
  kv = None
2011
  emb_dim = 0
2012
  if bool(use_embeddings):
2013
  kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
2014
 
2015
+ # Build partitions: per language within bucket if requested
2016
  parts = []
2017
  if bool(per_language):
2018
  for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
 
2023
  parts.append(((bkt, "all"), grp.copy()))
2024
 
2025
  labels_list, cluster_name_list, anomaly_list = [], [], []
2026
+ bucket_indexers = []
2027
  X_reduced_holder = None
2028
  term_names_global = {}
2029
  single_partition = (len(parts) == 1)
2030
+ d_word_agg, d_char_agg = 0, 0
2031
  svd_obj_local, norm_obj_local = None, None
2032
 
2033
  for (bucket_name, _lang), df_part in parts:
 
2063
  except Exception:
2064
  pass
2065
 
2066
+ labels, _, _ = _cluster_space(
2067
  X_space=X_space,
2068
+ bucket_name=bucket_name,
2069
  df_part=df_part,
2070
  use_lsa=bool(use_lsa) and X_reduced is not None,
2071
  use_hdbscan=bool(use_hdbscan),
 
2080
  d_word=d_word,
2081
  d_char=d_char,
2082
  )
 
 
2083
 
2084
  term_names = cluster_labels_pmi_bigram(
2085
  texts=texts, labels=labels, subjects=subjects_only,
2086
  topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
2087
  )
2088
 
 
2089
  bucket_indexers.append(df_part.index)
2090
  labels_list.append(pd.Series(labels, index=df_part.index))
2091
  cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
2092
  anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
 
2093
  term_names_global.update({int(k): v for k, v in term_names.items()})
2094
 
2095
  if single_partition and X_reduced is not None:
2096
  X_reduced_holder = X_reduced
2097
+
2098
  if labels_list:
2099
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
2100
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
 
2104
  df_main["cluster_name"] = "unclustered"
2105
  df_main["anomaly_score"] = np.nan
2106
 
2107
+ # Assign fixed ids for news/alerts buckets
2108
  if len(df_news):
2109
  df_news.loc[:, "cluster_id"] = -1
2110
  df_news.loc[:, "cluster_name"] = "newsletter/news"
 
2114
  df_alerts.loc[:, "cluster_name"] = "system/alerts"
2115
  df_alerts.loc[:, "anomaly_score"] = np.nan
2116
 
2117
+ # Merge back
2118
  df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
2119
+
2120
+ # Label de-dup pass per-bucket
2121
+ df = dedupe_all_labels(df)
2122
+
2123
+ # Scores
2124
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
2125
  df = compute_context_anomaly(df)
2126
 
2127
+ # Surveillance campaigns
2128
+ wl = [w.strip() for w in (watchlist_in or "").split(",") if w.strip()]
2129
+ ent_df, samp_df = detect_surveillance_campaigns(df, watchlist=wl, min_mentions=int(min_mentions or 15))
2130
+ df = tag_surveillance_emails(df, ent_df, threshold=4.5)
2131
+
2132
+ # Build indexes/search
2133
  index_obj = None
2134
  use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
2135
  if use_faiss_flag:
 
2146
  except Exception:
2147
  pass
2148
 
2149
+ # Summaries
2150
  cluster_counts = (
2151
  df.groupby(["bucket", "cluster_id", "cluster_name"])
2152
  .size()
 
2193
  norm_obj_out = norm_obj_local if single_partition else None
2194
 
2195
  return (
2196
+ status_md, # status
2197
+ cluster_counts, domain_counts, sender_counts, # summaries
2198
+ actors, offhours_table, # extra summaries
2199
+ ent_df, samp_df, # surveillance tables
2200
+ out_table, # results table
2201
+ df, vec_state, X_reduced_holder, # states
2202
+ index_obj, term_names_global, # index + labels
2203
+ bool(use_lsa), use_faiss_flag, # flags
2204
  gr.update(choices=cluster_choices, value="(any)"),
2205
  gr.update(choices=domain_choices, value="(any)"),
2206
  gr.update(choices=sender_choices, value="(any)"),
 
2210
  gr.update(choices=bucket_choices, value="(any)")
2211
  )
2212
 
2213
+ # Bind Process button
2214
  (run_btn.click)(
2215
  process_file,
2216
  inputs=[
 
2219
  trusted_domains_in, extra_keywords_in, highlight_toggle,
2220
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
2221
  per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
2222
+ watchlist_in, min_mentions
2223
  ],
2224
  outputs=[
2225
  status, cluster_counts_df, domain_counts_df, sender_counts_df,
2226
+ actors_df, offhours_df,
2227
+ surv_entities_df, surv_samples_df,
2228
+ results_df,
2229
+ state_df, state_vec, state_X_reduced,
2230
+ state_index, state_term_names,
2231
  state_use_lsa, state_use_faiss,
2232
  cluster_drop, domain_drop, sender_drop, lang_drop,
2233
+ state_svd, state_norm, state_dims,
2234
+ state_extra_terms, state_highlight,
2235
  bucket_drop,
2236
  ],
2237
  )
2238
 
2239
  # -------- Filtering & Search --------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2240
  def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
2241
  if df is None or len(df) == 0:
2242
  return pd.DataFrame()
 
2245
  )
2246
  return _sort_results(filt, sort_by, sort_dir)
2247
 
2248
+ # Re-run when any filter control changes
2249
  for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
2250
  date_start, date_end, sort_by, sort_dir, hide_noise]:
2251
  ctrl.change(
 
2272
  outputs=[results_df],
2273
  )
2274
 
2275
+ # --- Search ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2276
  def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
2277
  if not q or df is None or vec is None or index is None:
2278
  return pd.DataFrame(), []
 
2295
  return pd.DataFrame(), q_terms
2296
 
2297
  if isinstance(index, NearestNeighbors):
 
2298
  if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
2299
  return pd.DataFrame(), q_terms
2300
  dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
 
2319
  outputs=[results_df, state_query_terms],
2320
  )
2321
 
2322
+ # --- Reader selection (highlighting) ---
2323
  def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
2324
  if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
2325
  return ""
 
2354
  outputs=[email_view],
2355
  )
2356
 
2357
+ # --- Click-to-filter helpers ---
2358
  def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
2359
  if evt.index is None or df_sum is None or df_sum.empty:
2360
  return gr.update()
 
2367
  r = df_sum.iloc[evt.index[0]]
2368
  return gr.update(value=r["bucket"]), gr.update(value=r["label"])
2369
 
 
2370
  cluster_counts_df.select(
2371
  on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
2372
  ).then(
 
2378
  outputs=[results_df],
2379
  )
2380
 
 
2381
  domain_counts_df.select(
2382
  lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
2383
  ).then(
 
2389
  outputs=[results_df],
2390
  )
2391
 
 
2392
  sender_counts_df.select(
2393
  lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
2394
  ).then(
 
2402
 
2403
  if __name__ == "__main__":
2404
  # Disable SSR to avoid handler arity warnings under server-side rendering
2405
+ demo.launch(ssr_mode=False)
2406
+