Spaces:

wuhp
/

testmail2

Sleeping

App Files Files Community

wuhp committed on Sep 1, 2025

Commit

abfac2f

verified ·

1 Parent(s): 3e01db4

Update app.py

Browse files

Files changed (1) hide show

app.py +197 -58

app.py CHANGED Viewed

@@ -227,17 +227,28 @@ YEAR_RE = re.compile(r"^(19|20)\d{2}$")
 NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
 ONE_CHAR_RE = re.compile(r"^.$")
 # This stoplist is used by the CountVectorizer (MUST be list for sklearn)
 STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
 def _is_junk_term(t: str) -> bool:
-    tl = t.lower()
-    if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
-        return True
     if EMAIL_LIKE_RE.search(tl): return True
     if YEAR_RE.match(tl): return True
     if NUMERIC_RE.match(tl): return True
     if ONE_CHAR_RE.match(tl): return True
     return False
 def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
@@ -686,13 +697,22 @@ def enrich_text(row: pd.Series) -> str:
     tokens.append(lang_tok)
     return (t + " " + " ".join(tokens)).strip()
-# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
-def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20):
     """
     Improved labeler:
       - Considers bigrams AND trigrams (PMI vs. global)
       - Class-TFIDF unigrams with subject coverage boost
       - Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
     """
     import math as _math
     from collections import Counter, defaultdict
@@ -704,12 +724,13 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
     def is_junk_token(tok: str) -> bool:
         if _is_junk_term(tok): return True
         tl = tok.lower()
-        if tl.startswith("__"): return True
-        if tl in STOP_TERMS: return True
-        if tl in HEADER_STOP: return True
         if "@" in tl: return True
         if tl.isascii() and len(tl) <= 2: return True
-        if re.search(r"[^\w\-']", tl) and "’" not in tl and "'" not in tl: return True
         return False
     def tokenize_clean(t):
@@ -769,7 +790,7 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
     for c in sorted(set(int(x) for x in labels)):
         n_docs_c = max(1, per_c_doc_count[c])
-        # PMI bigrams & trigrams with subject-coverage boost
         phrases = []
         for store, glob_df, subj_docs, n in (
             (per_c_bg[c],  glob_df_bg,  per_c_subj_bg_docs[c], 2),
@@ -788,14 +809,17 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
                     cov = 0.0
                     if have_subjects:
                         cov = subj_docs[ng] / n_docs_c
                     score += subject_alpha * cov
-                    scored.append((score, ng))
-            scored.sort(reverse=True)
-            # take a couple from each class to avoid only-bigrams/only-trigrams
             take = max(1, topn // (3 if n == 3 else 2))
-            phrases.extend([p for _, p in scored[:take]])
-        # Class-TFIDF unigrams with subject coverage boost
         docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
         docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
         corpus = [docs_c[0], docs_bg[0]]
@@ -807,16 +831,24 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
         vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
         row = X[0].toarray().ravel()
-        # Subject coverage vector
         subj_cov = np.zeros_like(row)
         if have_subjects:
-            vocab_index = {t:i for i,t in enumerate(vocab)}
             for tok, cnt_docs in per_c_subj_uni_docs[c].items():
                 if tok in vocab_index and not is_junk_token(tok):
-                    subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
         row_boosted = row + subject_alpha * subj_cov
-        order = row_boosted.argsort()[::-1]
         unis = []
         for i in order:
             tok = vocab[i]
@@ -924,13 +956,95 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
         return seeds_red
     return None
 # =================== Scoring & Flags ===================
 def _hour_of(dt_iso: str) -> Optional[int]:
     try:
         if not dt_iso: return None
         dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
         if pd.isna(dt): return None
-        # treat UTC for lack of per-user tz; still useful as "odd hour"
         return int(dt.hour)
     except Exception:
         return None
@@ -960,11 +1074,9 @@ def corruption_score(row, trusted_domains: set):
     body_len = len(row.get("body_text",""))
     if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
         score += 0.5
-    # personal/off-channel via headers
     fd = (row.get("from_domain") or "").lower()
     if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
         score += 0.5
-    # odd hours
     h = _hour_of(row.get("date") or "")
     if h is not None and (h < 6 or h > 22):
         score += 0.3
@@ -1044,6 +1156,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             date_end   = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
             sort_by    = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
             sort_dir   = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
     with gr.Row():
         run_btn = gr.Button("Process", variant="primary")
@@ -1121,6 +1235,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         tag_value: str,
         start: str,
         end: str,
     ) -> pd.DataFrame:
         out = df
         if cluster and cluster != "(any)":
@@ -1151,11 +1266,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
             except Exception:
                 pass
         return out
     # -------- Simple social network stats --------
     def social_stats(df: pd.DataFrame) -> pd.DataFrame:
-        # degree = unique counterparts per address (from <-> each to/cc)
         deg = {}
         def add_edge(a,b):
             if not a or not b or a==b: return
@@ -1186,7 +1302,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         # === Vectorization & Clustering (UPGRADED) ===
         def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
-            # “texts” feed vectorizers; “subjects_only” helps labels
             texts = list(df_in.apply(enrich_text, axis=1))
             subjects_only = list(df_in["subject"].fillna(""))
             return texts, subjects_only
@@ -1228,7 +1343,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 count_vec = None; bm25 = None
                 return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
-            # Original Count -> BM25 + char
             count_vec = CountVectorizer(
                 analyzer="word", ngram_range=ngram_range,
                 max_features=int(max_features) if max_features else None,
@@ -1258,12 +1372,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             n_docs = X_full.shape[0]
             n_feats = X_full.shape[1]
-            # valid bound for TruncatedSVD: 1 <= n_components < min(n_docs, n_feats)
             max_components = max(1, min(n_docs, n_feats) - 1)
             n_comp = int(min(int(lsa_dim or 256), max_components))
             if n_comp < 2:
-                # too small to reduce meaningfully — skip LSA
                 return None, None, None
             svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
@@ -1274,9 +1386,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return X_reduced, svd_obj, norm_obj
         def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
-            """
-            Concatenate averaged word vectors (dense) into the current space.
-            """
             if kv is None or emb_dim <= 0 or weight <= 0.0:
                 return X_reduced_or_full, emb_dim
             doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
@@ -1303,19 +1412,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             norm_obj,
             d_word, d_char
         ):
-            """
-            Run HDBSCAN (if requested and available) or MiniBatchKMeans.
-            """
             n = X_space.shape[0]
-            # NEW: trivial/tiny partition handling
             if n <= 1:
                 labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
                 centers = None
                 chosen_k = int(n) if n > 0 else 0
                 return labels, centers, chosen_k
             if n < 10:
-                # avoid unstable large-k on tiny sets
                 k_small = min(max(2, n // 2), n)
                 kmeans = MiniBatchKMeans(
                     n_clusters=int(k_small),
@@ -1341,7 +1445,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 chosen_k = int(len(set([l for l in labels if l >= 0])))
                 return labels, centers, chosen_k
-            # Otherwise MiniBatchKMeans
             if bool(auto_k):
                 if use_lsa and isinstance(X_space, np.ndarray):
                     k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
@@ -1373,7 +1476,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             chosen_k = int(len(set(labels)))
             return labels, centers, chosen_k
-        # Main logic starts
         trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
         extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
         extra_terms_lower = [t.lower() for t in extra_terms]
@@ -1424,7 +1526,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if bool(use_embeddings):
             kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
-        # >>> FIX: keep original indices when splitting (no reset_index here)
         parts = []
         if bool(per_language) and "lang" in df_main.columns:
             for lang_code, grp in df_main.groupby("lang", dropna=False):
@@ -1488,18 +1589,26 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 d_word=d_word,
                 d_char=d_char,
             )
-            k_agg += chosen_k
             term_names = cluster_labels_pmi_bigram(
-                texts=texts, labels=labels, subjects=subjects_only, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20
             )
             term_names_global.update({int(k): v for k, v in term_names.items()})
-            # >>> We kept original indices in df_part, so Series align to df_main on concat
             labels_list.append(pd.Series(labels, index=df_part.index))
             cluster_name_list.append(
                 pd.Series(
-                    [term_names.get(int(c), f"noise_{int(c)}" if c < 0 else f"cluster_{int(c)}") for c in labels],
                     index=df_part.index,
                 )
             )
@@ -1517,7 +1626,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             df_main["cluster_name"] = "unclustered"
             df_main["anomaly_score"] = np.nan
-        # >>> Use .loc to avoid chained-assignment warnings
         if len(df_news):
             df_news.loc[:, "cluster_id"] = -1
             df_news.loc[:, "cluster_name"] = "newsletter/news"
@@ -1546,7 +1654,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             except Exception:
                 pass
-        # Summaries and UI updates...
         cluster_counts = (
             df.groupby(["cluster_id", "cluster_name"])
             .size()
@@ -1585,7 +1692,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             "max_df": float(max_df),
             "use_bigrams": bool(use_bigrams),
         }
-        status_md = f"**Processed {len(df):,} emails** | k = {k_agg}"
         svd_obj_out = svd_obj_local if single_partition else None
         norm_obj_out = norm_obj_local if single_partition else None
@@ -1710,13 +1817,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
-    def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir):
         if df is None or len(df) == 0:
             return pd.DataFrame()
-        filt = _apply_filters(df, cluster, domain, sender, lang, sentiment, tag, start, end)
         return _sort_results(filt, sort_by, sort_dir)
-    for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]:
         ctrl.change(
             refresh_results,
             inputs=[
@@ -1731,14 +1842,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 date_end,
                 sort_by,
                 sort_dir,
             ],
             outputs=[results_df],
         )
     reset_btn.click(
-        lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"],
         [],
-        [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
     ).then(
         refresh_results,
         inputs=[
@@ -1753,21 +1867,25 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             date_end,
             sort_by,
             sort_dir,
         ],
         outputs=[results_df],
     )
     def _tokenize_query(q: str) -> List[str]:
-        return [p.strip() for p in re.split(r"\s+", q) if p.strip()][:8]
     def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
         try:
             return norm.transform(svd.transform(q_vec)).astype(np.float32)
-        except:
             return None
     def _vectorize_query(q, vec_state, corpus_texts):
         char_min_df = 1 if len(corpus_texts) <= 1 else 2
         if vec_state.get("use_hashing"):
             hv = HashingVectorizer(
                 analyzer="word",
@@ -1775,7 +1893,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 n_features=2 ** vec_state.get("hash_bits", 18),
                 token_pattern=TOKEN_PATTERN,
                 lowercase=True,
             )
             counts = hv.transform(corpus_texts)
             tfidf_tr = TfidfTransformer().fit(counts)
             q_word = tfidf_tr.transform(hv.transform([q]))
@@ -1789,21 +1910,24 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 token_pattern=TOKEN_PATTERN,
                 lowercase=True,
                 stop_words=STOPWORD_FOR_VEC,
             )
             tf = cv.fit_transform(corpus_texts)
             bm25 = BM25Transformer().fit(tf)
             q_word = bm25.transform(cv.transform([q]))
         char_vec = CharTfidf(
-            analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True
         ).fit(corpus_texts)
         q_char = char_vec.transform([q])
         return hstack([q_word, q_char * 0.20], format="csr")
     def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
         if not q or df is None or vec is None or index is None:
             return pd.DataFrame(), []
         mask = ~df["cluster_id"].isin([-1, -2, -3])
         df_main = df[mask].reset_index(drop=True)
         if df_main.empty:
@@ -1812,7 +1936,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         q_terms = _tokenize_query(q)
         q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))
-        q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd and norm else q_vec
         if q_emb is None:
             return pd.DataFrame(), q_terms
@@ -1821,13 +1945,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return pd.DataFrame(), q_terms
         if isinstance(index, NearestNeighbors):
             if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
                 return pd.DataFrame(), q_terms
             dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
             sims = 1.0 - dists[0]
             results = df_main.iloc[inds[0]].copy()
             results["search_score"] = sims
-        elif use_faiss and FAISS_OK and isinstance(index, faiss.Index):
             D, I = index.search(q_emb.astype(np.float32), k=n_req)
             results = df_main.iloc[I[0]].copy()
             results["search_score"] = D[0]
@@ -1854,12 +1979,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         outputs=[results_df, state_query_terms],
     )
     def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
         if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
             return ""
         row_idx = evt.index[0]
         sel = table.iloc[row_idx]
         cand = df[
             (df["subject"] == sel.get("subject"))
             & (df["from_email"] == sel.get("from_email"))
@@ -1874,7 +2001,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         cid = int(row.get("cluster_id", -99))
         clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
         return build_highlighted_html(
-            row, query_terms=q_terms, cluster_label=clabel, do_highlight=do_highlight, extra_terms=extra_terms
         )
     results_df.select(
@@ -1883,12 +2014,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         outputs=[email_view],
     )
     def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
         if evt.index is None or df_sum is None or df_sum.empty:
             return gr.update()
         val = df_sum.iloc[evt.index[0]][col_name]
         return gr.update(value=val)
     cluster_counts_df.select(
         lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
     ).then(
@@ -1905,9 +2038,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             date_end,
             sort_by,
             sort_dir,
         ],
         outputs=[results_df],
     )
     domain_counts_df.select(
         lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
     ).then(
@@ -1924,9 +2060,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             date_end,
             sort_by,
             sort_dir,
         ],
         outputs=[results_df],
     )
     sender_counts_df.select(
         lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
     ).then(
@@ -1943,11 +2082,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             date_end,
             sort_by,
             sort_dir,
         ],
         outputs=[results_df],
     )
 if __name__ == "__main__":
     # Disable SSR to avoid handler arity warnings under server-side rendering
-    demo.launch(ssr_mode=False)

 NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
 ONE_CHAR_RE = re.compile(r"^.$")
+# ---- NEW junk token guards (base64/hex/trackers/very-long) ----
+LONG_ALNUM_RE   = re.compile(r"^[A-Za-z0-9_-]{24,}$")   # long tracking/b64-ish
+HEXISH_RE       = re.compile(r"^(?:[A-Fa-f0-9]{8,})$")  # long hex blobs
+DIGIT_HEAVY_RE  = re.compile(r"^(?:\D*\d){6,}\D*$")     # too many digits
+UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
 # This stoplist is used by the CountVectorizer (MUST be list for sklearn)
 STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
 def _is_junk_term(t: str) -> bool:
     """
     Return True when a token should be excluded from cluster labels / top terms.

     A token is junk when it is empty, a stop term (STOP_TERMS / EN_STOP /
     HE_STOP / MONTHS), e-mail-like, a year, purely numeric, a single
     character, or matches one of the blob guards (long alnum run, hex-ish,
     digit-heavy, underscore-containing identifier, longer than 40 chars).

     All checks run on the whitespace-stripped token so that padding cannot
     defeat the anchored regexes or inflate the length test.
     """
     tok = (t or "").strip()
     tl = tok.lower()
     if not tl:
         return True
     if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
         return True
     # Lower-cased checks (these were already applied to the normalised form).
     if EMAIL_LIKE_RE.search(tl) or YEAR_RE.match(tl) or NUMERIC_RE.match(tl) or ONE_CHAR_RE.match(tl):
         return True
     # Blob guards: use the stripped token (not the raw argument) so the
     # ^...$ anchors and the length cap see the same text as the checks above.
     if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok):
         return True
     if UNDERSCORE_HEAVY_RE.match(tok):
         return True
     return len(tok) > 40
 def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
     tokens.append(lang_tok)
     return (t + " " + " ".join(tokens)).strip()
+# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST (+ coverage ≥30% preference) ===================
+def cluster_labels_pmi_bigram(
+    texts,
+    labels,
+    subjects=None,
+    topn=6,
+    subject_alpha=0.75,
+    global_ubiq_cut=0.20,
+    subject_min_cov=0.30  # NEW: prefer subject terms that appear in ≥30% of a cluster's subjects
+):
     """
     Improved labeler:
       - Considers bigrams AND trigrams (PMI vs. global)
       - Class-TFIDF unigrams with subject coverage boost
       - Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
+      - NEW: prefers subject terms that occur in ≥30% of cluster subjects (can be tuned via subject_min_cov)
     """
     import math as _math
     from collections import Counter, defaultdict
     def is_junk_token(tok: str) -> bool:
         if _is_junk_term(tok): return True
         tl = tok.lower()
+        if tl.startswith("__"): return True               # our feature flags
         if "@" in tl: return True
         if tl.isascii() and len(tl) <= 2: return True
+        if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
+        if len(tok) > 40: return True
+        # strip punctuation-heavy artifacts (URLs already replaced with 'URL')
+        if re.search(r"[^\w\-’']", tl): return True
         return False
     def tokenize_clean(t):
     for c in sorted(set(int(x) for x in labels)):
         n_docs_c = max(1, per_c_doc_count[c])
+        # ===== PMI bigrams & trigrams with subject-coverage boost & ≥30% preference =====
         phrases = []
         for store, glob_df, subj_docs, n in (
             (per_c_bg[c],  glob_df_bg,  per_c_subj_bg_docs[c], 2),
                     cov = 0.0
                     if have_subjects:
                         cov = subj_docs[ng] / n_docs_c
+                        # add a preference bump if subject coverage ≥ threshold
+                        if cov >= subject_min_cov:
+                            score += 0.6  # modest, ensures such terms bubble up
                     score += subject_alpha * cov
+                    scored.append((score, cov, ng))
+            # prefer subject-coverage ≥ threshold first, then highest score
+            scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
             take = max(1, topn // (3 if n == 3 else 2))
+            phrases.extend([p for _, _, p in scored[:take]])
+        # ===== Class-TFIDF unigrams with subject coverage boost & ≥30% preference =====
         docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
         docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
         corpus = [docs_c[0], docs_bg[0]]
         vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
         row = X[0].toarray().ravel()
         subj_cov = np.zeros_like(row)
+        subj_cov_frac = np.zeros_like(row)
+        vocab_index = {t:i for i,t in enumerate(vocab)}
         if have_subjects:
             for tok, cnt_docs in per_c_subj_uni_docs[c].items():
                 if tok in vocab_index and not is_junk_token(tok):
+                    i = vocab_index[tok]
+                    frac = cnt_docs / n_docs_c
+                    subj_cov[i] = frac
+                    subj_cov_frac[i] = frac
+        # base + subject alpha
         row_boosted = row + subject_alpha * subj_cov
+        # final score gets a preference bump if subj coverage ≥ threshold (but not a hard filter)
+        pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
+        final = row_boosted + pref_bump
+        order = final.argsort()[::-1]
         unis = []
         for i in order:
             tok = vocab[i]
         return seeds_red
     return None
+# =================== NEW: cluster stabilizer (merge near-dupes + reassign tiny → big; else NOISE=-3) ===================
+def _centroids_from_labels(X, labels):
+    labs = np.asarray(labels, dtype=int)
+    uniq = np.unique(labs)
+    cents = {}
+    if isinstance(X, np.ndarray):
+        for c in uniq:
+            idx = (labs == c)
+            if not np.any(idx): continue
+            v = X[idx].mean(axis=0)
+            n = np.linalg.norm(v)
+            if n > 0: v = v / n
+            cents[int(c)] = v.astype(np.float32)
+        return cents
+    # CSR sparse
+    X = X.tocsr()
+    for c in uniq:
+        rows = np.where(labs == c)[0]
+        if rows.size == 0: continue
+        sub = X[rows]
+        v = np.asarray(sub.mean(axis=0)).ravel()
+        n = np.linalg.norm(v)
+        if n > 0: v = v / n
+        cents[int(c)] = v.astype(np.float32)
+    return cents
+def _cosine_sim_to_centroids(vecs, centroids):
+    if not centroids:
+        return None, None
+    keys = list(centroids.keys())
+    C = np.stack([centroids[k] for k in keys], axis=0)  # (k,d)
+    if isinstance(vecs, np.ndarray):
+        sims = vecs @ C.T
+    else:
+        sims = vecs.dot(C.T)
+        if hasattr(sims, "toarray"): sims = sims.toarray()
+    best_idx = np.argmax(sims, axis=1)
+    best_lab = np.array([keys[i] for i in best_idx], dtype=int)
+    best_sim = sims[np.arange(sims.shape[0]), best_idx]
+    return best_lab, best_sim
def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
    """
    Post-process cluster labels: merge near-duplicate clusters, then dissolve tiny ones.

    Steps:
      1. Centroids of all non-negative labels are compared pairwise; clusters
         whose centroid similarity is >= ``merge_thresh`` are merged
         (transitively, via union-find).
      2. Every label with fewer than ``min_size`` members (negative labels
         included) is dissolved: each of its rows moves to the most similar
         "big" non-negative cluster if that similarity is >= ``reassign_thresh``,
         otherwise it is marked as noise (-3). If no big non-negative cluster
         exists, nothing is reassigned.

    Args:
        X_space: 2-D feature matrix (ndarray or SciPy sparse), one row per label.
        labels: integer cluster labels; the input is never modified.
        min_size: minimum member count for a cluster to count as "big".
        merge_thresh: centroid similarity at or above which clusters merge.
        reassign_thresh: minimum similarity for a row to join a big cluster.

    Returns:
        A new int ndarray of stabilised labels.
    """
    # np.array (not asarray) forces a copy: the in-place write in step 2 must
    # not leak into the caller's array when no merge rebuilds ``labs``.
    labs = np.array(labels, dtype=int)
    if hasattr(X_space, "tocsr"):
        X_space = X_space.tocsr()  # row indexing below needs CSR

    # 1) merge very close centroids
    cents = _centroids_from_labels(X_space, labs)
    keys = sorted(k for k in cents if k >= 0)
    if len(keys) >= 2:
        C = np.stack([cents[k] for k in keys], axis=0)
        sims = C @ C.T
        parent = {k: k for k in keys}

        def find(a):
            while parent[a] != a:
                a = parent[a]
            return a

        thresh = float(merge_thresh)
        for i in range(len(keys)):
            for j in range(i + 1, len(keys)):
                if sims[i, j] >= thresh:
                    ri, rj = find(keys[i]), find(keys[j])
                    if ri != rj:
                        parent[rj] = ri
        merge_map = {k: find(k) for k in keys}
        labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
        cents = _centroids_from_labels(X_space, labs)

    # 2) reassign tiny clusters to nearest big centroid (else noise -3)
    NOISE_ID = -3
    vc = pd.Series(labs).value_counts()
    big_labs = set(vc[vc >= int(min_size)].index.tolist())
    small_labs = set(vc[vc < int(min_size)].index.tolist())
    # Negative labels are never valid reassignment targets.
    big_cents = {c: cents[c] for c in sorted(big_labs) if c in cents and c >= 0}
    if small_labs and big_cents:
        idx_small = np.flatnonzero(np.isin(labs, list(small_labs)))
        if idx_small.size > 0:
            best_lab, best_sim = _cosine_sim_to_centroids(X_space[idx_small], big_cents)
            labs[idx_small] = np.where(best_sim >= float(reassign_thresh), best_lab, NOISE_ID)
    return labs
 # =================== Scoring & Flags ===================
 def _hour_of(dt_iso: str) -> Optional[int]:
     try:
         if not dt_iso: return None
         dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
         if pd.isna(dt): return None
         return int(dt.hour)
     except Exception:
         return None
     body_len = len(row.get("body_text",""))
     if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
         score += 0.5
     fd = (row.get("from_domain") or "").lower()
     if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
         score += 0.5
     h = _hour_of(row.get("date") or "")
     if h is not None and (h < 6 or h > 22):
         score += 0.3
             date_end   = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
             sort_by    = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
             sort_dir   = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
+        # NEW: hide noise toggle
+        hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
     with gr.Row():
         run_btn = gr.Button("Process", variant="primary")
         tag_value: str,
         start: str,
         end: str,
+        hide_noise_flag: bool = False,  # NEW
     ) -> pd.DataFrame:
         out = df
         if cluster and cluster != "(any)":
                 out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
             except Exception:
                 pass
+        if hide_noise_flag:
+            out = out[out["cluster_id"] != -3]
         return out
     # -------- Simple social network stats --------
     def social_stats(df: pd.DataFrame) -> pd.DataFrame:
         deg = {}
         def add_edge(a,b):
             if not a or not b or a==b: return
         # === Vectorization & Clustering (UPGRADED) ===
         def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
             texts = list(df_in.apply(enrich_text, axis=1))
             subjects_only = list(df_in["subject"].fillna(""))
             return texts, subjects_only
                 count_vec = None; bm25 = None
                 return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
             count_vec = CountVectorizer(
                 analyzer="word", ngram_range=ngram_range,
                 max_features=int(max_features) if max_features else None,
             n_docs = X_full.shape[0]
             n_feats = X_full.shape[1]
             max_components = max(1, min(n_docs, n_feats) - 1)
             n_comp = int(min(int(lsa_dim or 256), max_components))
             if n_comp < 2:
                 return None, None, None
             svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
             return X_reduced, svd_obj, norm_obj
         def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
             if kv is None or emb_dim <= 0 or weight <= 0.0:
                 return X_reduced_or_full, emb_dim
             doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
             norm_obj,
             d_word, d_char
         ):
             n = X_space.shape[0]
             if n <= 1:
                 labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
                 centers = None
                 chosen_k = int(n) if n > 0 else 0
                 return labels, centers, chosen_k
             if n < 10:
                 k_small = min(max(2, n // 2), n)
                 kmeans = MiniBatchKMeans(
                     n_clusters=int(k_small),
                 chosen_k = int(len(set([l for l in labels if l >= 0])))
                 return labels, centers, chosen_k
             if bool(auto_k):
                 if use_lsa and isinstance(X_space, np.ndarray):
                     k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
             chosen_k = int(len(set(labels)))
             return labels, centers, chosen_k
         trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
         extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
         extra_terms_lower = [t.lower() for t in extra_terms]
         if bool(use_embeddings):
             kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
         parts = []
         if bool(per_language) and "lang" in df_main.columns:
             for lang_code, grp in df_main.groupby("lang", dropna=False):
                 d_word=d_word,
                 d_char=d_char,
             )
+            # NEW: stabilize per partition
+            labels = stabilize_labels(
+                X_space, labels,
+                min_size=40,
+                merge_thresh=0.96,
+                reassign_thresh=0.35,
+            )
+            k_agg += len(set(labels))
             term_names = cluster_labels_pmi_bigram(
+                texts=texts, labels=labels, subjects=subjects_only,
+                topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
             )
             term_names_global.update({int(k): v for k, v in term_names.items()})
             labels_list.append(pd.Series(labels, index=df_part.index))
             cluster_name_list.append(
                 pd.Series(
+                    [term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels],
                     index=df_part.index,
                 )
             )
             df_main["cluster_name"] = "unclustered"
             df_main["anomaly_score"] = np.nan
         if len(df_news):
             df_news.loc[:, "cluster_id"] = -1
             df_news.loc[:, "cluster_name"] = "newsletter/news"
             except Exception:
                 pass
         cluster_counts = (
             df.groupby(["cluster_id", "cluster_name"])
             .size()
             "max_df": float(max_df),
             "use_bigrams": bool(use_bigrams),
         }
+        status_md = f"**Processed {len(df):,} emails** | clusters ~ {len(cluster_counts):,} (showing top 500)"
         svd_obj_out = svd_obj_local if single_partition else None
         norm_obj_out = norm_obj_local if single_partition else None
         return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
+    def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
         if df is None or len(df) == 0:
             return pd.DataFrame()
+        filt = _apply_filters(
+            df, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
+        )
         return _sort_results(filt, sort_by, sort_dir)
+    # Re-run when any filter control changes (including hide_noise)
+    for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
+                 date_start, date_end, sort_by, sort_dir, hide_noise]:
         ctrl.change(
             refresh_results,
             inputs=[
                 date_end,
                 sort_by,
                 sort_dir,
+                hide_noise,
             ],
             outputs=[results_df],
         )
+    # Reset filters: selects back to (any), dates blank, sort to corruption_score/desc, and hide_noise=True
     reset_btn.click(
+        lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"] + [True],
         [],
+        [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
+         date_start, date_end, sort_by, sort_dir, hide_noise],
     ).then(
         refresh_results,
         inputs=[
             date_end,
             sort_by,
             sort_dir,
+            hide_noise,
         ],
         outputs=[results_df],
     )
+    # -------- Search helpers --------
     def _tokenize_query(q: str) -> List[str]:
+        return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
     def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
         try:
             return norm.transform(svd.transform(q_vec)).astype(np.float32)
+        except Exception:
             return None
     def _vectorize_query(q, vec_state, corpus_texts):
+        # Build the same features for the query that we used for docs
         char_min_df = 1 if len(corpus_texts) <= 1 else 2
         if vec_state.get("use_hashing"):
             hv = HashingVectorizer(
                 analyzer="word",
                 n_features=2 ** vec_state.get("hash_bits", 18),
                 token_pattern=TOKEN_PATTERN,
                 lowercase=True,
+                norm=None,
+                alternate_sign=False,
             )
+            # Fit TF-IDF weights from corpus
             counts = hv.transform(corpus_texts)
             tfidf_tr = TfidfTransformer().fit(counts)
             q_word = tfidf_tr.transform(hv.transform([q]))
                 token_pattern=TOKEN_PATTERN,
                 lowercase=True,
                 stop_words=STOPWORD_FOR_VEC,
+                dtype=np.float32,
             )
             tf = cv.fit_transform(corpus_texts)
             bm25 = BM25Transformer().fit(tf)
             q_word = bm25.transform(cv.transform([q]))
         char_vec = CharTfidf(
+            analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
         ).fit(corpus_texts)
         q_char = char_vec.transform([q])
         return hstack([q_word, q_char * 0.20], format="csr")
     def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
         if not q or df is None or vec is None or index is None:
             return pd.DataFrame(), []
+        # Search ignores newsletters/alerts/noise by default
         mask = ~df["cluster_id"].isin([-1, -2, -3])
         df_main = df[mask].reset_index(drop=True)
         if df_main.empty:
         q_terms = _tokenize_query(q)
         q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))
+        q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd is not None and norm is not None else q_vec
         if q_emb is None:
             return pd.DataFrame(), q_terms
             return pd.DataFrame(), q_terms
         if isinstance(index, NearestNeighbors):
+            # brute-force cosine on reduced space
             if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
                 return pd.DataFrame(), q_terms
             dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
             sims = 1.0 - dists[0]
             results = df_main.iloc[inds[0]].copy()
             results["search_score"] = sims
+        elif use_faiss and FAISS_OK and hasattr(index, "search"):
             D, I = index.search(q_emb.astype(np.float32), k=n_req)
             results = df_main.iloc[I[0]].copy()
             results["search_score"] = D[0]
         outputs=[results_df, state_query_terms],
     )
+    # -------- Reader selection (build highlighted HTML) --------
     def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
         if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
             return ""
         row_idx = evt.index[0]
         sel = table.iloc[row_idx]
+        # Try to match the original row
         cand = df[
             (df["subject"] == sel.get("subject"))
             & (df["from_email"] == sel.get("from_email"))
         cid = int(row.get("cluster_id", -99))
         clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
         return build_highlighted_html(
+            row,
+            query_terms=q_terms,
+            cluster_label=clabel,
+            do_highlight=do_highlight,
+            extra_terms=extra_terms,
         )
     results_df.select(
         outputs=[email_view],
     )
+    # Click-to-filter conveniences for summary tables
     def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
         """Turn a click on a summary table into a value update for a filter control.

         Args:
             evt: Selection event; ``evt.index[0]`` is used as the row position.
             df_sum: Summary table the click happened on.
             col_name: Column of ``df_sum`` whose value becomes the new filter value.
             out_comp: Target dropdown; not read by this function.

         Returns:
             ``gr.update(value=...)`` carrying the clicked row's ``col_name`` value,
             or a bare ``gr.update()`` when there is no selection index or the
             table is ``None`` or empty.
         """
         nothing_selected = evt.index is None
         if nothing_selected or df_sum is None or df_sum.empty:
             return gr.update()

         row_pos = evt.index[0]
         picked = df_sum.iloc[row_pos][col_name]
         return gr.update(value=picked)
+    # Cluster summary → set cluster filter
     cluster_counts_df.select(
         lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
     ).then(
             date_end,
             sort_by,
             sort_dir,
+            hide_noise,
         ],
         outputs=[results_df],
     )
+    # Domain summary → set domain filter
     domain_counts_df.select(
         lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
     ).then(
             date_end,
             sort_by,
             sort_dir,
+            hide_noise,
         ],
         outputs=[results_df],
     )
+    # Sender summary → set sender filter
     sender_counts_df.select(
         lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
     ).then(
             date_end,
             sort_by,
             sort_dir,
+            hide_noise,
         ],
         outputs=[results_df],
     )
 if __name__ == "__main__":
     # Disable SSR to avoid handler arity warnings under server-side rendering
+    demo.launch(ssr_mode=False)