Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,22 @@ from sklearn.preprocessing import Normalizer
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Optional light anomaly detection
|
| 29 |
try:
|
| 30 |
from sklearn.ensemble import IsolationForest
|
|
@@ -571,6 +587,62 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 571 |
)
|
| 572 |
return html
|
| 573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
# =================== Feature engineering (BM25 + char) ===================
|
| 575 |
class BM25Transformer:
|
| 576 |
def __init__(self, k1=1.2, b=0.75):
|
|
@@ -615,96 +687,117 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 615 |
return (t + " " + " ".join(tokens)).strip()
|
| 616 |
|
| 617 |
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
|
| 618 |
-
def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75):
|
| 619 |
"""
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
`subjects`: list of subject strings aligned with `texts`
|
| 625 |
-
`subject_alpha`: weight added per token = alpha * coverage_in_subjects (0..1)
|
| 626 |
"""
|
| 627 |
import math as _math
|
| 628 |
from collections import Counter, defaultdict
|
| 629 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 630 |
|
| 631 |
-
HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded",
|
|
|
|
| 632 |
|
| 633 |
def is_junk_token(tok: str) -> bool:
|
| 634 |
if _is_junk_term(tok): return True
|
| 635 |
tl = tok.lower()
|
| 636 |
-
if tl.startswith("__"): return True
|
| 637 |
-
if tl in STOP_TERMS: return True
|
| 638 |
if tl in HEADER_STOP: return True
|
| 639 |
if "@" in tl: return True
|
| 640 |
-
# drop short ASCII like "eb/ys/yl"
|
| 641 |
if tl.isascii() and len(tl) <= 2: return True
|
| 642 |
-
|
| 643 |
-
if re.search(r"[^\w\-']", tl):
|
| 644 |
-
if "’" not in tl and "'" not in tl:
|
| 645 |
-
return True
|
| 646 |
return False
|
| 647 |
|
| 648 |
def tokenize_clean(t):
|
| 649 |
-
toks = re.findall(TOKEN_PATTERN, t.lower())
|
| 650 |
return [w for w in toks if not is_junk_token(w)]
|
| 651 |
|
| 652 |
-
def
|
| 653 |
-
return [" ".join(p) for p in zip(toks
|
| 654 |
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
| 656 |
per_c_bg = defaultdict(Counter)
|
|
|
|
| 657 |
per_c_texts = defaultdict(list)
|
| 658 |
per_c_doc_count = defaultdict(int)
|
| 659 |
-
|
| 660 |
-
# SUBJECT presence (unique tokens/bigrams per subject per doc)
|
| 661 |
per_c_subj_uni_docs = defaultdict(Counter)
|
| 662 |
per_c_subj_bg_docs = defaultdict(Counter)
|
|
|
|
| 663 |
|
| 664 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 665 |
|
|
|
|
| 666 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 667 |
c = int(c)
|
| 668 |
toks = tokenize_clean(txt)
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
per_c_texts[c].append(" ".join(toks))
|
| 673 |
per_c_doc_count[c] += 1
|
| 674 |
-
|
| 675 |
if have_subjects:
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
|
|
|
|
|
|
|
|
|
| 682 |
labels_out = {}
|
| 683 |
-
|
|
|
|
|
|
|
|
|
|
| 684 |
|
| 685 |
for c in sorted(set(int(x) for x in labels)):
|
| 686 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 687 |
|
| 688 |
-
# PMI bigrams
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
if
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 707 |
-
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
|
| 708 |
corpus = [docs_c[0], docs_bg[0]]
|
| 709 |
vec = TfidfVectorizer(
|
| 710 |
analyzer="word", ngram_range=(1,1),
|
|
@@ -714,30 +807,27 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
|
|
| 714 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 715 |
row = X[0].toarray().ravel()
|
| 716 |
|
| 717 |
-
#
|
| 718 |
subj_cov = np.zeros_like(row)
|
| 719 |
if have_subjects:
|
| 720 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 721 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 722 |
-
if tok in vocab_index:
|
| 723 |
-
subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
row_boosted[i] = row[i] + subject_alpha * float(subj_cov[i])
|
| 730 |
-
|
| 731 |
-
top_idx = row_boosted.argsort()[::-1][: max(0, topn - len(top_bi)) ]
|
| 732 |
-
top_uni = []
|
| 733 |
-
for i in top_idx:
|
| 734 |
tok = vocab[i]
|
| 735 |
-
if
|
| 736 |
-
|
| 737 |
-
|
|
|
|
|
|
|
| 738 |
break
|
| 739 |
|
| 740 |
-
parts =
|
| 741 |
labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
|
| 742 |
|
| 743 |
return labels_out
|
|
@@ -898,12 +988,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 898 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 899 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 900 |
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
|
|
|
|
|
|
|
|
|
| 901 |
with gr.Row():
|
| 902 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 903 |
lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
|
| 904 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 905 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 906 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 907 |
with gr.Row():
|
| 908 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 909 |
use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
|
|
@@ -913,6 +1012,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 913 |
trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
|
| 914 |
extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
|
| 915 |
highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 916 |
with gr.Row():
|
| 917 |
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 918 |
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
|
@@ -934,6 +1039,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 934 |
with gr.Row():
|
| 935 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 936 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
|
|
|
|
|
|
|
|
|
| 937 |
|
| 938 |
with gr.Row():
|
| 939 |
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
|
@@ -1003,7 +1111,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1003 |
) -> pd.DataFrame:
|
| 1004 |
out = df
|
| 1005 |
if cluster and cluster != "(any)":
|
| 1006 |
-
m = re.match(r"^(\d+)\s+—", cluster)
|
| 1007 |
if m:
|
| 1008 |
cid = int(m.group(1))
|
| 1009 |
out = out[out["cluster_id"] == cid]
|
|
@@ -1054,13 +1162,182 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1054 |
# -------- Main pipeline --------
|
| 1055 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1056 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1057 |
-
trusted_domains_in, extra_keywords_in, highlight_toggle
|
|
|
|
|
|
|
|
|
|
| 1058 |
if inbox_file is None:
|
| 1059 |
return ("**Please upload a file.**",
|
| 1060 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1061 |
None, None, None, None)
|
| 1062 |
|
| 1063 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1064 |
|
| 1065 |
# trusted org domains
|
| 1066 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
|
@@ -1071,19 +1348,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1071 |
recs = _load_json_records(inbox_file.name)
|
| 1072 |
if not recs:
|
| 1073 |
return ("**No valid records found.**",
|
| 1074 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1075 |
None, None, None, None)
|
| 1076 |
|
| 1077 |
# Normalize
|
| 1078 |
normd = []
|
| 1079 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 1080 |
-
out = normalize_email_record(r, use_langdetect=
|
| 1081 |
if out and out.get("body_text") is not None:
|
| 1082 |
normd.append(out)
|
| 1083 |
df = pd.DataFrame(normd)
|
| 1084 |
if df.empty:
|
| 1085 |
return ("**No usable email records after normalization.**",
|
| 1086 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1087 |
None, None, None, None)
|
| 1088 |
|
| 1089 |
# Deduplicate conservatively
|
|
@@ -1116,135 +1393,150 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1116 |
df_news = df[df["is_news"]].reset_index(drop=True)
|
| 1117 |
df_alerts = df[df["is_notify"]].reset_index(drop=True)
|
| 1118 |
|
| 1119 |
-
#
|
| 1120 |
-
|
| 1121 |
-
subjects_only = list(df_main["subject"].fillna(""))
|
| 1122 |
-
|
| 1123 |
-
# === Vectorization ===
|
| 1124 |
-
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 1125 |
-
count_vec = CountVectorizer(
|
| 1126 |
-
analyzer="word",
|
| 1127 |
-
ngram_range=ngram_range,
|
| 1128 |
-
max_features=int(max_features) if max_features else None,
|
| 1129 |
-
min_df=int(min_df) if min_df else 2,
|
| 1130 |
-
max_df=float(max_df) if max_df else 0.7,
|
| 1131 |
-
token_pattern=TOKEN_PATTERN,
|
| 1132 |
-
lowercase=True,
|
| 1133 |
-
dtype=np.float32,
|
| 1134 |
-
stop_words=STOPWORD_FOR_VEC, # list (not set)
|
| 1135 |
-
)
|
| 1136 |
-
TF = count_vec.fit_transform(texts)
|
| 1137 |
-
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 1138 |
-
X_word = bm25.transform(TF) # sparse BM25 word matrix
|
| 1139 |
-
|
| 1140 |
-
char_vec = CharTfidf(
|
| 1141 |
-
analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
|
| 1142 |
-
lowercase=True, dtype=np.float32
|
| 1143 |
-
)
|
| 1144 |
-
X_char = char_vec.fit_transform(texts)
|
| 1145 |
-
|
| 1146 |
-
# Down-weight char-grams so they don't dominate geometry
|
| 1147 |
-
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1148 |
-
d_word = X_word.shape[1]
|
| 1149 |
-
d_char = X_char.shape[1]
|
| 1150 |
-
d_full = X_full.shape[1]
|
| 1151 |
-
|
| 1152 |
-
# LSA
|
| 1153 |
-
use_lsa = bool(use_lsa)
|
| 1154 |
-
X_reduced = None
|
| 1155 |
-
svd_obj = None
|
| 1156 |
-
norm_obj = None
|
| 1157 |
-
if use_lsa:
|
| 1158 |
-
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 1159 |
-
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
|
| 1160 |
-
norm_obj = Normalizer(copy=False)
|
| 1161 |
-
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 1162 |
-
del X_reduced_tmp
|
| 1163 |
-
gc.collect()
|
| 1164 |
-
|
| 1165 |
-
# Optional anomaly detection (on LSA space)
|
| 1166 |
-
anomaly_scores = np.full((len(df_main),), np.nan, dtype=np.float32)
|
| 1167 |
-
if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
|
| 1168 |
-
try:
|
| 1169 |
-
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 1170 |
-
iso.fit(X_reduced)
|
| 1171 |
-
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32) # higher = more anomalous
|
| 1172 |
-
except Exception:
|
| 1173 |
-
pass
|
| 1174 |
|
| 1175 |
-
#
|
| 1176 |
-
|
| 1177 |
-
|
| 1178 |
-
|
| 1179 |
-
|
| 1180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1181 |
else:
|
| 1182 |
-
|
| 1183 |
-
|
| 1184 |
-
#
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
)
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
-
|
| 1194 |
-
|
| 1195 |
-
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
| 1199 |
-
|
| 1200 |
-
|
| 1201 |
-
|
| 1202 |
-
|
| 1203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1204 |
|
| 1205 |
-
|
| 1206 |
-
|
| 1207 |
-
|
|
|
|
|
|
|
|
|
|
| 1208 |
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1213 |
-
df_main["anomaly_score"] = anomaly_scores
|
| 1214 |
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
-
df_news["cluster_name"] = "newsletter/news"
|
| 1219 |
-
df_news["anomaly_score"] = np.nan
|
| 1220 |
|
| 1221 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1222 |
if len(df_alerts):
|
| 1223 |
-
df_alerts["cluster_id"] = -2
|
| 1224 |
-
df_alerts["cluster_name"] = "system/alerts"
|
| 1225 |
-
df_alerts["anomaly_score"] = np.nan
|
| 1226 |
|
| 1227 |
# Combine back
|
| 1228 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 1229 |
|
| 1230 |
-
#
|
| 1231 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1232 |
|
| 1233 |
-
# Build search index on
|
| 1234 |
-
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (
|
| 1235 |
index_obj = None
|
| 1236 |
if use_faiss:
|
| 1237 |
-
d =
|
| 1238 |
-
index_obj = faiss.IndexFlatIP(d)
|
| 1239 |
-
index_obj.add(
|
| 1240 |
else:
|
| 1241 |
-
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1245 |
# Summaries
|
| 1246 |
cluster_counts = (
|
| 1247 |
-
df[
|
| 1248 |
.groupby(["cluster_id", "cluster_name"]).size()
|
| 1249 |
.reset_index(name="count")
|
| 1250 |
.sort_values("count", ascending=False)
|
|
@@ -1254,11 +1546,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1254 |
# Append buckets
|
| 1255 |
news_count = int((df["cluster_id"] == -1).sum())
|
| 1256 |
alerts_count = int((df["cluster_id"] == -2).sum())
|
|
|
|
| 1257 |
extra_rows = []
|
| 1258 |
if news_count > 0:
|
| 1259 |
extra_rows.append({"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count})
|
| 1260 |
if alerts_count > 0:
|
| 1261 |
extra_rows.append({"cluster_id": -2, "cluster_name": "system/alerts", "count": alerts_count})
|
|
|
|
|
|
|
| 1262 |
if extra_rows:
|
| 1263 |
cluster_counts = pd.concat([cluster_counts, pd.DataFrame(extra_rows)], ignore_index=True)
|
| 1264 |
|
|
@@ -1283,6 +1578,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1283 |
)
|
| 1284 |
sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
|
| 1285 |
|
|
|
|
| 1286 |
# Languages present
|
| 1287 |
langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
|
| 1288 |
lang_choices = ["(any)"] + langs
|
|
@@ -1304,13 +1600,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1304 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1305 |
out_table = show_df[cols_out].head(500)
|
| 1306 |
|
| 1307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1308 |
|
| 1309 |
status_md = (
|
| 1310 |
f"**Processed {len(df):,} emails** \n"
|
| 1311 |
-
f"Word feats
|
| 1312 |
-
f"{'LSA: ' + str(
|
| 1313 |
-
f"k = {
|
| 1314 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
| 1315 |
)
|
| 1316 |
|
|
@@ -1320,27 +1624,36 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1320 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1321 |
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1322 |
lang_update = gr.update(choices=lang_choices, value="(any)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1323 |
|
| 1324 |
return (
|
| 1325 |
status_md,
|
| 1326 |
-
cluster_counts, domain_counts,
|
| 1327 |
actors, offhours_table,
|
| 1328 |
out_table,
|
| 1329 |
-
df, vec_state, (
|
| 1330 |
-
use_lsa, bool(use_faiss),
|
| 1331 |
cluster_update, domain_update, sender_update, lang_update,
|
| 1332 |
-
|
| 1333 |
-
(
|
| 1334 |
extra_terms_lower, bool(highlight_toggle)
|
| 1335 |
)
|
| 1336 |
|
| 1337 |
(run_btn.click)(
|
| 1338 |
process_file,
|
| 1339 |
-
inputs=[
|
| 1340 |
-
|
| 1341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1342 |
outputs=[status,
|
| 1343 |
-
cluster_counts_df, domain_counts_df,
|
| 1344 |
actors_df, offhours_df,
|
| 1345 |
results_df,
|
| 1346 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
|
@@ -1371,6 +1684,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1371 |
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1372 |
tmp = tmp.drop(columns=["_dt"])
|
| 1373 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
|
|
|
|
|
|
| 1374 |
acc = [c for c in cols_out if c in tmp.columns]
|
| 1375 |
return tmp[acc].head(500)
|
| 1376 |
|
|
@@ -1416,40 +1731,75 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1416 |
except Exception:
|
| 1417 |
return None
|
| 1418 |
|
| 1419 |
-
def _vectorize_query(q: str, vec_state: Dict[str, Any]):
|
| 1420 |
-
|
| 1421 |
-
|
| 1422 |
-
|
| 1423 |
-
|
| 1424 |
-
|
| 1425 |
-
|
| 1426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1427 |
return q_full
|
| 1428 |
|
| 1429 |
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1430 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1431 |
return pd.DataFrame(), []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1432 |
q_terms = _tokenize_query(q)
|
| 1433 |
-
q_vec_full = _vectorize_query(q, vec_state)
|
| 1434 |
-
|
|
|
|
| 1435 |
q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
|
| 1436 |
if q_emb is None:
|
| 1437 |
return pd.DataFrame(), q_terms
|
| 1438 |
else:
|
| 1439 |
q_emb = q_vec_full
|
| 1440 |
-
|
| 1441 |
-
|
| 1442 |
-
|
| 1443 |
-
|
| 1444 |
-
|
| 1445 |
if isinstance(index_obj, NearestNeighbors):
|
| 1446 |
-
|
|
|
|
|
|
|
|
|
|
| 1447 |
inds = indices[0]
|
| 1448 |
sims = 1.0 - distances[0]
|
| 1449 |
results = filtered_df.iloc[inds].copy()
|
| 1450 |
results["search_score"] = sims
|
| 1451 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1452 |
-
D, I = index_obj.search(q_emb.astype(np.float32),
|
| 1453 |
inds = I[0]
|
| 1454 |
sims = D[0]
|
| 1455 |
results = filtered_df.iloc[inds].copy()
|
|
@@ -1459,8 +1809,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1459 |
|
| 1460 |
# blend with corruption score lightly
|
| 1461 |
cs = results["corruption_score"].fillna(0.0)
|
| 1462 |
-
|
| 1463 |
-
results["_blend"] = 0.7*results["search_score"].values + 0.3*
|
| 1464 |
# sort UI-selected way
|
| 1465 |
if sort_by == "search_score":
|
| 1466 |
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
|
@@ -1471,7 +1821,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1471 |
results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
|
| 1472 |
results = results.drop(columns=["_blend"])
|
| 1473 |
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
|
| 1474 |
-
return results
|
| 1475 |
|
| 1476 |
search_btn.click(
|
| 1477 |
search_fn,
|
|
@@ -1499,12 +1849,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1499 |
if dstr is not None:
|
| 1500 |
cand = cand[cand["date"] == dstr]
|
| 1501 |
if len(cand) == 0:
|
|
|
|
| 1502 |
cand = df[df["subject"] == sel.get("subject", "")]
|
| 1503 |
if len(cand) == 0:
|
| 1504 |
-
return ""
|
| 1505 |
row = cand.iloc[0]
|
| 1506 |
-
cid = int(row.get("cluster_id", -
|
| 1507 |
-
clabel = term_names.get(cid,
|
| 1508 |
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
|
| 1509 |
|
| 1510 |
results_df.select(
|
|
@@ -1523,6 +1874,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1523 |
return "(any)"
|
| 1524 |
label = df_sum.iloc[row_idx]["label"]
|
| 1525 |
return label if isinstance(label, str) else "(any)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1526 |
|
| 1527 |
cluster_counts_df.select(
|
| 1528 |
on_cluster_click,
|
|
@@ -1533,6 +1895,31 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1533 |
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1534 |
outputs=[results_df]
|
| 1535 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1536 |
|
| 1537 |
if __name__ == "__main__":
|
| 1538 |
-
demo.launch()
|
|
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
| 28 |
+
# === NEW / UPDATED IMPORTS ===
|
| 29 |
+
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer # NEW
|
| 30 |
+
from scipy.sparse import csr_matrix # NEW (to mix dense embeddings with sparse)
|
| 31 |
+
try:
|
| 32 |
+
import hdbscan # OPTIONAL (pip install hdbscan)
|
| 33 |
+
HDBSCAN_OK = True
|
| 34 |
+
except Exception:
|
| 35 |
+
HDBSCAN_OK = False
|
| 36 |
+
|
| 37 |
+
# Optional tiny/fast word vectors via Gensim (local .txt/.vec/.bin)
|
| 38 |
+
try:
|
| 39 |
+
from gensim.models import KeyedVectors # OPTIONAL
|
| 40 |
+
GENSIM_OK = True
|
| 41 |
+
except Exception:
|
| 42 |
+
GENSIM_OK = False
|
| 43 |
+
|
| 44 |
# Optional light anomaly detection
|
| 45 |
try:
|
| 46 |
from sklearn.ensemble import IsolationForest
|
|
|
|
| 587 |
)
|
| 588 |
return html
|
| 589 |
|
| 590 |
+
# ---------- Lightweight Embedding Utilities (Optional) ----------
|
| 591 |
+
def _load_embeddings(emb_path: str, binary: bool):
    """
    Load word vectors with Gensim if available.

    Accepts word2vec binary (.bin) or text formats (.txt/.vec).
    Returns (model, dim) on success, or (None, 0) when Gensim is missing,
    the path is empty/absent, or the file cannot be parsed in any format.
    """
    # Bail out early when embeddings are unusable in this environment.
    if not (GENSIM_OK and emb_path and os.path.exists(emb_path)):
        return None, 0
    # Try the declared format first, then fall back to a headerless
    # (GloVe-style) text file before giving up.
    attempts = (
        {"binary": True} if binary else {"binary": False, "no_header": False},
        {"binary": False, "no_header": True},
    )
    for kwargs in attempts:
        try:
            kv = KeyedVectors.load_word2vec_format(emb_path, **kwargs)
            return kv, int(kv.vector_size)
        except Exception:
            continue
    return None, 0
|
| 612 |
+
|
| 613 |
+
def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
| 614 |
+
"""
|
| 615 |
+
Average embeddings over tokens matched by TOKEN_PATTERN.
|
| 616 |
+
Returns zero vector if nothing matches or kv is None.
|
| 617 |
+
"""
|
| 618 |
+
vec = np.zeros((dim,), dtype=np.float32)
|
| 619 |
+
if not kv or not text:
|
| 620 |
+
return vec
|
| 621 |
+
toks = re.findall(TOKEN_PATTERN, text.lower())
|
| 622 |
+
cnt = 0
|
| 623 |
+
for t in toks:
|
| 624 |
+
if t in kv:
|
| 625 |
+
vec += kv[t]
|
| 626 |
+
cnt += 1
|
| 627 |
+
if cnt > 0:
|
| 628 |
+
vec /= float(cnt)
|
| 629 |
+
# L2-normalize
|
| 630 |
+
n = np.linalg.norm(vec)
|
| 631 |
+
if n > 0:
|
| 632 |
+
vec /= n
|
| 633 |
+
return vec
|
| 634 |
+
|
| 635 |
+
def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
|
| 636 |
+
"""
|
| 637 |
+
Build [n_docs, dim] dense matrix of averaged embeddings.
|
| 638 |
+
"""
|
| 639 |
+
if not kv or dim <= 0:
|
| 640 |
+
return np.zeros((len(texts), 0), dtype=np.float32)
|
| 641 |
+
out = np.zeros((len(texts), dim), dtype=np.float32)
|
| 642 |
+
for i, t in enumerate(texts):
|
| 643 |
+
out[i, :] = _avg_embed_for_text(t or "", kv, dim)
|
| 644 |
+
return out
|
| 645 |
+
|
| 646 |
# =================== Feature engineering (BM25 + char) ===================
|
| 647 |
class BM25Transformer:
|
| 648 |
def __init__(self, k1=1.2, b=0.75):
|
|
|
|
| 687 |
return (t + " " + " ".join(tokens)).strip()
|
| 688 |
|
| 689 |
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
|
| 690 |
+
def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20):
|
| 691 |
"""
|
| 692 |
+
Improved labeler:
|
| 693 |
+
- Considers bigrams AND trigrams (PMI vs. global)
|
| 694 |
+
- Class-TFIDF unigrams with subject coverage boost
|
| 695 |
+
- Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
|
|
|
|
|
|
|
| 696 |
"""
|
| 697 |
import math as _math
|
| 698 |
from collections import Counter, defaultdict
|
| 699 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 700 |
|
| 701 |
+
HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded",
|
| 702 |
+
"回复","主题","收件人","发件人"}
|
| 703 |
|
| 704 |
def is_junk_token(tok: str) -> bool:
|
| 705 |
if _is_junk_term(tok): return True
|
| 706 |
tl = tok.lower()
|
| 707 |
+
if tl.startswith("__"): return True
|
| 708 |
+
if tl in STOP_TERMS: return True
|
| 709 |
if tl in HEADER_STOP: return True
|
| 710 |
if "@" in tl: return True
|
|
|
|
| 711 |
if tl.isascii() and len(tl) <= 2: return True
|
| 712 |
+
if re.search(r"[^\w\-']", tl) and "’" not in tl and "'" not in tl: return True
|
|
|
|
|
|
|
|
|
|
| 713 |
return False
|
| 714 |
|
| 715 |
def tokenize_clean(t):
|
| 716 |
+
toks = re.findall(TOKEN_PATTERN, (t or "").lower())
|
| 717 |
return [w for w in toks if not is_junk_token(w)]
|
| 718 |
|
| 719 |
+
def ngrams(toks, n):
|
| 720 |
+
return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
|
| 721 |
|
| 722 |
+
# Compute global doc frequency for tokens, bigrams, trigrams
|
| 723 |
+
glob_df_uni = Counter()
|
| 724 |
+
glob_df_bg = Counter()
|
| 725 |
+
glob_df_tri = Counter()
|
| 726 |
per_c_bg = defaultdict(Counter)
|
| 727 |
+
per_c_tri = defaultdict(Counter)
|
| 728 |
per_c_texts = defaultdict(list)
|
| 729 |
per_c_doc_count = defaultdict(int)
|
|
|
|
|
|
|
| 730 |
per_c_subj_uni_docs = defaultdict(Counter)
|
| 731 |
per_c_subj_bg_docs = defaultdict(Counter)
|
| 732 |
+
per_c_subj_tri_docs = defaultdict(Counter)
|
| 733 |
|
| 734 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 735 |
|
| 736 |
+
# Pre-pass: DF stats
|
| 737 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 738 |
c = int(c)
|
| 739 |
toks = tokenize_clean(txt)
|
| 740 |
+
uni_set = set(toks)
|
| 741 |
+
bg_set = set(ngrams(toks, 2))
|
| 742 |
+
tri_set = set(ngrams(toks, 3))
|
| 743 |
+
# DF
|
| 744 |
+
glob_df_uni.update(uni_set)
|
| 745 |
+
glob_df_bg.update(bg_set)
|
| 746 |
+
glob_df_tri.update(tri_set)
|
| 747 |
+
# Per-cluster counts
|
| 748 |
+
per_c_bg[c].update(bg_set)
|
| 749 |
+
per_c_tri[c].update(tri_set)
|
| 750 |
per_c_texts[c].append(" ".join(toks))
|
| 751 |
per_c_doc_count[c] += 1
|
| 752 |
+
# Subject presence
|
| 753 |
if have_subjects:
|
| 754 |
+
stoks = tokenize_clean(subjects[idx] or "")
|
| 755 |
+
s_uni = set(stoks)
|
| 756 |
+
s_bg = set(ngrams(stoks, 2))
|
| 757 |
+
s_tri = set(ngrams(stoks, 3))
|
| 758 |
+
per_c_subj_uni_docs[c].update(s_uni)
|
| 759 |
+
per_c_subj_bg_docs[c].update(s_bg)
|
| 760 |
+
per_c_subj_tri_docs[c].update(s_tri)
|
| 761 |
+
|
| 762 |
+
N = max(1, len(texts))
|
| 763 |
labels_out = {}
|
| 764 |
+
|
| 765 |
+
# Helper: ubiquity filter
|
| 766 |
+
def too_ubiquitous(df_count): # fraction of docs
|
| 767 |
+
return (df_count / float(N)) > float(global_ubiq_cut)
|
| 768 |
|
| 769 |
for c in sorted(set(int(x) for x in labels)):
|
| 770 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 771 |
|
| 772 |
+
# PMI bigrams & trigrams with subject-coverage boost
|
| 773 |
+
phrases = []
|
| 774 |
+
for store, glob_df, subj_docs, n in (
|
| 775 |
+
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
| 776 |
+
(per_c_tri[c], glob_df_tri, per_c_subj_tri_docs[c], 3),
|
| 777 |
+
):
|
| 778 |
+
total_c = sum(store.values()) + 1e-12
|
| 779 |
+
total_g = sum(glob_df.values()) + 1e-12
|
| 780 |
+
scored = []
|
| 781 |
+
for ng, cnt in store.most_common(3000):
|
| 782 |
+
if too_ubiquitous(glob_df[ng]): # skip ubiquitous n-grams
|
| 783 |
+
continue
|
| 784 |
+
p_ng_c = cnt / total_c
|
| 785 |
+
p_ng_g = (glob_df[ng] / total_g)
|
| 786 |
+
if p_ng_c > 0 and p_ng_g > 0:
|
| 787 |
+
score = _math.log(p_ng_c) - _math.log(p_ng_g)
|
| 788 |
+
cov = 0.0
|
| 789 |
+
if have_subjects:
|
| 790 |
+
cov = subj_docs[ng] / n_docs_c
|
| 791 |
+
score += subject_alpha * cov
|
| 792 |
+
scored.append((score, ng))
|
| 793 |
+
scored.sort(reverse=True)
|
| 794 |
+
# take a couple from each class to avoid only-bigrams/only-trigrams
|
| 795 |
+
take = max(1, topn // (3 if n == 3 else 2))
|
| 796 |
+
phrases.extend([p for _, p in scored[:take]])
|
| 797 |
+
|
| 798 |
+
# Class-TFIDF unigrams with subject coverage boost
|
| 799 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 800 |
+
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 801 |
corpus = [docs_c[0], docs_bg[0]]
|
| 802 |
vec = TfidfVectorizer(
|
| 803 |
analyzer="word", ngram_range=(1,1),
|
|
|
|
| 807 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 808 |
row = X[0].toarray().ravel()
|
| 809 |
|
| 810 |
+
# Subject coverage vector
|
| 811 |
subj_cov = np.zeros_like(row)
|
| 812 |
if have_subjects:
|
| 813 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 814 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 815 |
+
if tok in vocab_index and not is_junk_token(tok):
|
| 816 |
+
subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
|
| 817 |
+
|
| 818 |
+
row_boosted = row + subject_alpha * subj_cov
|
| 819 |
+
order = row_boosted.argsort()[::-1]
|
| 820 |
+
unis = []
|
| 821 |
+
for i in order:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
tok = vocab[i]
|
| 823 |
+
if is_junk_token(tok): continue
|
| 824 |
+
if too_ubiquitous(glob_df_uni.get(tok, 0)): # suppress ubiquitous tokens
|
| 825 |
+
continue
|
| 826 |
+
unis.append(tok)
|
| 827 |
+
if len(unis) >= max(0, topn - len(phrases)):
|
| 828 |
break
|
| 829 |
|
| 830 |
+
parts = (phrases + unis)[:max(2, topn)]
|
| 831 |
labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
|
| 832 |
|
| 833 |
return labels_out
|
|
|
|
| 988 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 989 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 990 |
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
| 991 |
+
with gr.Row():
|
| 992 |
+
use_hashing = gr.Checkbox(label="Use HashingVectorizer (memory-light, fast)", value=True) # NEW
|
| 993 |
+
hash_bits = gr.Slider(label="Hashing bits (2^n features)", minimum=16, maximum=20, step=1, value=18) # NEW
|
| 994 |
with gr.Row():
|
| 995 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 996 |
lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
|
| 997 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 998 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 999 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 1000 |
+
with gr.Row():
|
| 1001 |
+
use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False) # NEW
|
| 1002 |
+
hdb_min_cluster = gr.Number(label="HDBSCAN min_cluster_size", value=60, precision=0) # NEW
|
| 1003 |
+
hdb_min_samples = gr.Number(label="HDBSCAN min_samples (0=auto)", value=0, precision=0) # NEW
|
| 1004 |
+
with gr.Row():
|
| 1005 |
+
per_language = gr.Checkbox(label="Cluster per language (reduces cross-language mixing)", value=True) # NEW
|
| 1006 |
with gr.Row():
|
| 1007 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 1008 |
use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
|
|
|
|
| 1012 |
trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
|
| 1013 |
extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
|
| 1014 |
highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
|
| 1015 |
+
with gr.Row():
|
| 1016 |
+
use_embeddings = gr.Checkbox(label="Add lightweight word embeddings (avg word2vec/GloVe) if available", value=False) # NEW
|
| 1017 |
+
embed_weight = gr.Slider(label="Embedding weight in feature space", minimum=0.0, maximum=1.0, step=0.05, value=0.35) # NEW
|
| 1018 |
+
with gr.Row():
|
| 1019 |
+
embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="") # NEW
|
| 1020 |
+
embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False) # NEW
|
| 1021 |
with gr.Row():
|
| 1022 |
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 1023 |
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
|
|
|
| 1039 |
with gr.Row():
|
| 1040 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 1041 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 1042 |
+
|
| 1043 |
+
with gr.Row():
|
| 1044 |
+
sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
|
| 1045 |
|
| 1046 |
with gr.Row():
|
| 1047 |
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
|
|
|
| 1111 |
) -> pd.DataFrame:
|
| 1112 |
out = df
|
| 1113 |
if cluster and cluster != "(any)":
|
| 1114 |
+
m = re.match(r"^(\-?\d+)\s+—", cluster)
|
| 1115 |
if m:
|
| 1116 |
cid = int(m.group(1))
|
| 1117 |
out = out[out["cluster_id"] == cid]
|
|
|
|
| 1162 |
# -------- Main pipeline --------
|
| 1163 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1164 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1165 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1166 |
+
# NEW:
|
| 1167 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1168 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary):
|
| 1169 |
if inbox_file is None:
|
| 1170 |
return ("**Please upload a file.**",
|
| 1171 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1172 |
None, None, None, None)
|
| 1173 |
|
| 1174 |
+
# === Vectorization & Clustering (UPGRADED) ===
|
| 1175 |
+
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
| 1176 |
+
# “texts” feed vectorizers; “subjects_only” helps labels
|
| 1177 |
+
texts = list(df_in.apply(enrich_text, axis=1))
|
| 1178 |
+
subjects_only = list(df_in["subject"].fillna(""))
|
| 1179 |
+
return texts, subjects_only
|
| 1180 |
+
|
| 1181 |
+
def _vectorize_block(
|
| 1182 |
+
texts: List[str],
|
| 1183 |
+
use_bigrams: bool,
|
| 1184 |
+
max_features: int,
|
| 1185 |
+
min_df: int,
|
| 1186 |
+
max_df: float,
|
| 1187 |
+
use_hashing: bool,
|
| 1188 |
+
hash_bits: int
|
| 1189 |
+
):
|
| 1190 |
+
"""
|
| 1191 |
+
Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
|
| 1192 |
+
Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
|
| 1193 |
+
"""
|
| 1194 |
+
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 1195 |
+
|
| 1196 |
+
if use_hashing:
|
| 1197 |
+
# HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
|
| 1198 |
+
# We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
|
| 1199 |
+
hv = HashingVectorizer(
|
| 1200 |
+
analyzer="word",
|
| 1201 |
+
ngram_range=ngram_range,
|
| 1202 |
+
n_features=2 ** int(hash_bits),
|
| 1203 |
+
alternate_sign=False, # keep positivity
|
| 1204 |
+
token_pattern=TOKEN_PATTERN,
|
| 1205 |
+
lowercase=True,
|
| 1206 |
+
norm=None # raw counts first
|
| 1207 |
+
)
|
| 1208 |
+
word_counts = hv.transform(texts) # CSR
|
| 1209 |
+
# IDF (approximate TF-IDF since we lack exact DF per token str; good enough in practice here)
|
| 1210 |
+
tfidf_tr = TfidfTransformer()
|
| 1211 |
+
X_word = tfidf_tr.fit_transform(word_counts).astype(np.float32)
|
| 1212 |
+
|
| 1213 |
+
char_vec = CharTfidf(
|
| 1214 |
+
analyzer="char", ngram_range=(3, 5),
|
| 1215 |
+
min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
|
| 1216 |
+
)
|
| 1217 |
+
X_char = char_vec.fit_transform(texts)
|
| 1218 |
+
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1219 |
+
d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
|
| 1220 |
+
# For downstream code compatibility, expose “count_vec” and “bm25” placeholders
|
| 1221 |
+
count_vec = None
|
| 1222 |
+
bm25 = None
|
| 1223 |
+
return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
|
| 1224 |
+
|
| 1225 |
+
# Original Count -> BM25 + char
|
| 1226 |
+
count_vec = CountVectorizer(
|
| 1227 |
+
analyzer="word", ngram_range=ngram_range,
|
| 1228 |
+
max_features=int(max_features) if max_features else None,
|
| 1229 |
+
min_df=int(min_df) if min_df else 2,
|
| 1230 |
+
max_df=float(max_df) if max_df else 0.7,
|
| 1231 |
+
token_pattern=TOKEN_PATTERN, lowercase=True,
|
| 1232 |
+
dtype=np.float32, stop_words=STOPWORD_FOR_VEC
|
| 1233 |
+
)
|
| 1234 |
+
TF = count_vec.fit_transform(texts)
|
| 1235 |
+
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 1236 |
+
X_word = bm25.transform(TF)
|
| 1237 |
+
|
| 1238 |
+
char_vec = CharTfidf(
|
| 1239 |
+
analyzer="char", ngram_range=(3, 5),
|
| 1240 |
+
min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
|
| 1241 |
+
)
|
| 1242 |
+
X_char = char_vec.fit_transform(texts)
|
| 1243 |
+
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1244 |
+
d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
|
| 1245 |
+
return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
|
| 1246 |
+
|
| 1247 |
+
def _reduce_space(X_full, use_lsa, lsa_dim):
|
| 1248 |
+
svd_obj = None
|
| 1249 |
+
norm_obj = None
|
| 1250 |
+
X_reduced = None
|
| 1251 |
+
if use_lsa:
|
| 1252 |
+
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 256), random_state=0)
|
| 1253 |
+
Xtmp = svd_obj.fit_transform(X_full) # dense
|
| 1254 |
+
norm_obj = Normalizer(copy=False)
|
| 1255 |
+
X_reduced = norm_obj.fit_transform(Xtmp).astype(np.float32)
|
| 1256 |
+
del Xtmp; gc.collect()
|
| 1257 |
+
return X_reduced, svd_obj, norm_obj
|
| 1258 |
+
|
| 1259 |
+
def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
|
| 1260 |
+
"""
|
| 1261 |
+
Concatenate averaged word vectors (dense) into the current space.
|
| 1262 |
+
We convert dense embeddings to CSR to safely hstack with CSR (if using non-LSA path).
|
| 1263 |
+
"""
|
| 1264 |
+
if kv is None or emb_dim <= 0 or weight <= 0.0:
|
| 1265 |
+
return X_reduced_or_full, emb_dim
|
| 1266 |
+
doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
|
| 1267 |
+
if weight != 1.0:
|
| 1268 |
+
doc_embs *= float(weight)
|
| 1269 |
+
if isinstance(X_reduced_or_full, np.ndarray):
|
| 1270 |
+
# LSA path (dense): concat dense to dense
|
| 1271 |
+
return np.hstack([X_reduced_or_full, doc_embs]).astype(np.float32), emb_dim
|
| 1272 |
+
else:
|
| 1273 |
+
# Sparse path: convert embeddings to CSR and hstack
|
| 1274 |
+
X_emb = csr_matrix(doc_embs)
|
| 1275 |
+
return hstack([X_reduced_or_full, X_emb], format="csr"), emb_dim
|
| 1276 |
+
|
| 1277 |
+
def _cluster_space(
|
| 1278 |
+
X_space,
|
| 1279 |
+
df_part: pd.DataFrame,
|
| 1280 |
+
use_lsa: bool,
|
| 1281 |
+
use_hdbscan: bool,
|
| 1282 |
+
hdb_min_cluster: int,
|
| 1283 |
+
hdb_min_samples: int,
|
| 1284 |
+
auto_k: bool,
|
| 1285 |
+
k_clusters: int,
|
| 1286 |
+
mb_batch: int,
|
| 1287 |
+
count_vec,
|
| 1288 |
+
svd_obj,
|
| 1289 |
+
norm_obj,
|
| 1290 |
+
d_word, d_char
|
| 1291 |
+
):
|
| 1292 |
+
"""
|
| 1293 |
+
Run HDBSCAN (if requested and available) or MiniBatchKMeans.
|
| 1294 |
+
Return: labels (np.array), centers (np.ndarray) or None if HDBSCAN, and chosen_k.
|
| 1295 |
+
"""
|
| 1296 |
+
if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
|
| 1297 |
+
# HDBSCAN on LSA (dense) only
|
| 1298 |
+
min_samples = None if int(hdb_min_samples or 0) <= 0 else int(hdb_min_samples)
|
| 1299 |
+
clusterer = hdbscan.HDBSCAN(
|
| 1300 |
+
min_cluster_size=int(hdb_min_cluster or 60),
|
| 1301 |
+
min_samples=min_samples,
|
| 1302 |
+
metric='euclidean', # cosine≈euclidean on L2-normalized vectors
|
| 1303 |
+
cluster_selection_epsilon=0.0,
|
| 1304 |
+
core_dist_n_jobs=1 # CPU-friendly on HF
|
| 1305 |
+
)
|
| 1306 |
+
labels = clusterer.fit_predict(X_space)
|
| 1307 |
+
centers = None
|
| 1308 |
+
chosen_k = int(len(set([l for l in labels if l >= 0])))
|
| 1309 |
+
return labels, centers, chosen_k
|
| 1310 |
+
|
| 1311 |
+
# Otherwise MiniBatchKMeans (supports dense or sparse)
|
| 1312 |
+
if bool(auto_k):
|
| 1313 |
+
if use_lsa and isinstance(X_space, np.ndarray):
|
| 1314 |
+
k, _ = choose_k_by_kneedle(X_space, ks=(50,100,150,200,300,400,500))
|
| 1315 |
+
else:
|
| 1316 |
+
k = auto_k_rule(X_space.shape[0])
|
| 1317 |
+
else:
|
| 1318 |
+
k = max(10, int(k_clusters or 350))
|
| 1319 |
+
|
| 1320 |
+
init = None
|
| 1321 |
+
if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
|
| 1322 |
+
seeds = seeded_centroids_in_lsa(
|
| 1323 |
+
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 1324 |
+
d_word=d_word, d_full=(d_word + d_char), k=k
|
| 1325 |
+
)
|
| 1326 |
+
if seeds is not None and seeds.shape[0] == k:
|
| 1327 |
+
init = seeds
|
| 1328 |
+
|
| 1329 |
+
kmeans = MiniBatchKMeans(
|
| 1330 |
+
n_clusters=k, batch_size=int(mb_batch or 4096),
|
| 1331 |
+
random_state=0, n_init="auto" if init is None else 1,
|
| 1332 |
+
init="k-means++" if init is None else init
|
| 1333 |
+
)
|
| 1334 |
+
labels = kmeans.fit_predict(X_space)
|
| 1335 |
+
centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
|
| 1336 |
+
if use_lsa and centers is not None:
|
| 1337 |
+
labels = merge_close_clusters(labels, centers, thresh=0.95)
|
| 1338 |
+
chosen_k = int(len(set(labels)))
|
| 1339 |
+
return labels, centers, chosen_k
|
| 1340 |
+
|
| 1341 |
|
| 1342 |
# trusted org domains
|
| 1343 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
|
|
|
| 1348 |
recs = _load_json_records(inbox_file.name)
|
| 1349 |
if not recs:
|
| 1350 |
return ("**No valid records found.**",
|
| 1351 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1352 |
None, None, None, None)
|
| 1353 |
|
| 1354 |
# Normalize
|
| 1355 |
normd = []
|
| 1356 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 1357 |
+
out = normalize_email_record(r, use_langdetect=(not bool(skip_lang)))
|
| 1358 |
if out and out.get("body_text") is not None:
|
| 1359 |
normd.append(out)
|
| 1360 |
df = pd.DataFrame(normd)
|
| 1361 |
if df.empty:
|
| 1362 |
return ("**No usable email records after normalization.**",
|
| 1363 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1364 |
None, None, None, None)
|
| 1365 |
|
| 1366 |
# Deduplicate conservatively
|
|
|
|
| 1393 |
df_news = df[df["is_news"]].reset_index(drop=True)
|
| 1394 |
df_alerts = df[df["is_notify"]].reset_index(drop=True)
|
| 1395 |
|
| 1396 |
+
# ----- Build texts (and optionally partition by language) -----
|
| 1397 |
+
use_lang = not bool(skip_lang)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1398 |
|
| 1399 |
+
# Optional embeddings
|
| 1400 |
+
kv = None; emb_dim = 0
|
| 1401 |
+
if bool(use_embeddings):
|
| 1402 |
+
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1403 |
+
|
| 1404 |
+
# Optional per-language partitioning to avoid cross-language mixing
|
| 1405 |
+
parts = []
|
| 1406 |
+
if bool(per_language) and "lang" in df_main.columns:
|
| 1407 |
+
for lang_code, grp in df_main.groupby("lang"):
|
| 1408 |
+
if lang_code in (None, "", "unknown"):
|
| 1409 |
+
# keep unknown together
|
| 1410 |
+
parts.append(("unknown", grp.reset_index(drop=True)))
|
| 1411 |
+
else:
|
| 1412 |
+
parts.append((lang_code, grp.reset_index(drop=True)))
|
| 1413 |
else:
|
| 1414 |
+
parts = [("all", df_main.reset_index(drop=True))]
|
| 1415 |
+
|
| 1416 |
+
# Collect per-part results to concatenate
|
| 1417 |
+
labels_list = []
|
| 1418 |
+
cluster_name_list = []
|
| 1419 |
+
anomaly_list = []
|
| 1420 |
+
X_reduced_holder = None # only kept if use_lsa True and single-part; else None
|
| 1421 |
+
nn_index_obj = None # per overall (we build on all df_main at end)
|
| 1422 |
+
term_names_global = {}
|
| 1423 |
+
|
| 1424 |
+
# We'll build a global ANN/search index only if single partition & LSA on
|
| 1425 |
+
single_partition = (len(parts) == 1)
|
| 1426 |
+
d_word_agg = 0; d_char_agg=0;
|
| 1427 |
+
k_agg = 0
|
| 1428 |
+
|
| 1429 |
+
for p_lang, df_part in parts:
|
| 1430 |
+
if df_part.empty: continue
|
| 1431 |
+
texts, subjects_only = _make_texts(df_part)
|
| 1432 |
+
|
| 1433 |
+
# Vectorize
|
| 1434 |
+
X_full, count_vec, char_vec, bm25_local, d_word, d_char, d_full = _vectorize_block(
|
| 1435 |
+
texts=texts,
|
| 1436 |
+
use_bigrams=bool(use_bigrams),
|
| 1437 |
+
max_features=int(max_features or 120000),
|
| 1438 |
+
min_df=int(min_df or 3),
|
| 1439 |
+
max_df=float(max_df or 0.7),
|
| 1440 |
+
use_hashing=bool(use_hashing),
|
| 1441 |
+
hash_bits=int(hash_bits or 18)
|
| 1442 |
)
|
| 1443 |
+
d_word_agg += d_word; d_char_agg += d_char
|
| 1444 |
+
|
| 1445 |
+
# Dim reduction
|
| 1446 |
+
X_reduced, svd_obj_local, norm_obj_local = _reduce_space(X_full, bool(use_lsa), int(lsa_dim or 256))
|
| 1447 |
+
X_space = (X_reduced if X_reduced is not None else X_full)
|
| 1448 |
+
|
| 1449 |
+
# Optional embeddings concat
|
| 1450 |
+
if kv is not None and emb_dim > 0 and float(embed_weight or 0) > 0:
|
| 1451 |
+
X_space, _ = _attach_embeddings(texts, X_space, bool(use_lsa), kv, emb_dim, float(embed_weight))
|
| 1452 |
+
|
| 1453 |
+
# Optional anomaly (on LSA only, before clustering)
|
| 1454 |
+
anomaly_scores = np.full((len(df_part),), np.nan, dtype=np.float32)
|
| 1455 |
+
if X_reduced is not None and bool(use_iso) and ISO_OK and X_reduced.shape[0] >= 50:
|
| 1456 |
+
try:
|
| 1457 |
+
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 1458 |
+
iso.fit(X_reduced)
|
| 1459 |
+
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
|
| 1460 |
+
except Exception:
|
| 1461 |
+
pass
|
| 1462 |
+
|
| 1463 |
+
# Cluster (HDBSCAN or MiniBatchKMeans)
|
| 1464 |
+
labels, centers, chosen_k = _cluster_space(
|
| 1465 |
+
X_space=X_space, df_part=df_part, use_lsa=bool(use_lsa),
|
| 1466 |
+
use_hdbscan=bool(use_hdbscan), hdb_min_cluster=int(hdb_min_cluster or 60),
|
| 1467 |
+
hdb_min_samples=int(hdb_min_samples or 0),
|
| 1468 |
+
auto_k=bool(auto_k), k_clusters=int(k_clusters or 350), mb_batch=int(mb_batch or 4096),
|
| 1469 |
+
count_vec=count_vec, svd_obj=svd_obj_local, norm_obj=norm_obj_local,
|
| 1470 |
+
d_word=d_word, d_char=d_char
|
| 1471 |
+
)
|
| 1472 |
+
k_agg += chosen_k
|
| 1473 |
|
| 1474 |
+
# HDBSCAN yields -1 noise; keep as-is. KMeans may have merges above.
|
| 1475 |
+
# Labels -> cluster names
|
| 1476 |
+
term_names = cluster_labels_pmi_bigram(
|
| 1477 |
+
texts=texts, labels=labels, subjects=subjects_only, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20
|
| 1478 |
+
)
|
| 1479 |
+
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1480 |
|
| 1481 |
+
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1482 |
+
cluster_name_list.append(pd.Series([term_names.get(int(c), f"noise_{int(c)}" if c < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
|
| 1483 |
+
anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
|
|
|
|
|
|
|
| 1484 |
|
| 1485 |
+
# If single partition and LSA on, we’ll build ANN on that space after loop
|
| 1486 |
+
if single_partition and bool(use_lsa) and X_reduced is not None and X_reduced_holder is None:
|
| 1487 |
+
X_reduced_holder = X_reduced
|
|
|
|
|
|
|
| 1488 |
|
| 1489 |
+
# Stitch part results back to df_main order
|
| 1490 |
+
if not labels_list: # handle case where df_main was empty
|
| 1491 |
+
df_main["cluster_id"] = -10
|
| 1492 |
+
df_main["cluster_name"] = "unclustered"
|
| 1493 |
+
df_main["anomaly_score"] = np.nan
|
| 1494 |
+
else:
|
| 1495 |
+
df_main = df_main.copy()
|
| 1496 |
+
df_main["cluster_id"] = pd.concat(labels_list).sort_index()
|
| 1497 |
+
df_main["cluster_name"] = pd.concat(cluster_name_list).sort_index()
|
| 1498 |
+
df_main["anomaly_score"] = pd.concat(anomaly_list).sort_index()
|
| 1499 |
+
|
| 1500 |
+
# Keep your special buckets
|
| 1501 |
+
if len(df_news):
|
| 1502 |
+
df_news["cluster_id"] = -1; df_news["cluster_name"] = "newsletter/news"; df_news["anomaly_score"] = np.nan
|
| 1503 |
if len(df_alerts):
|
| 1504 |
+
df_alerts["cluster_id"] = -2; df_alerts["cluster_name"] = "system/alerts"; df_alerts["anomaly_score"] = np.nan
|
|
|
|
|
|
|
| 1505 |
|
| 1506 |
# Combine back
|
| 1507 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 1508 |
|
| 1509 |
+
# Corruption score (unchanged)
|
| 1510 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1511 |
|
| 1512 |
+
# --- Build search index (Faiss on LSA if single partition; else fallback to cosine brute) ---
|
| 1513 |
+
use_faiss = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
|
| 1514 |
index_obj = None
|
| 1515 |
if use_faiss:
|
| 1516 |
+
d = X_reduced_holder.shape[1]
|
| 1517 |
+
index_obj = faiss.IndexFlatIP(d)
|
| 1518 |
+
index_obj.add(X_reduced_holder)
|
| 1519 |
else:
|
| 1520 |
+
# fallback: brute cosine on feature space of df_main again (we need a space to fit)
|
| 1521 |
+
# For multi-part, just use corruption/search blending without ANN speedup (NearestNeighbors over dense LSA best-effort)
|
| 1522 |
+
try:
|
| 1523 |
+
if bool(use_lsa) and X_reduced_holder is not None and single_partition:
|
| 1524 |
+
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 1525 |
+
nn.fit(X_reduced_holder)
|
| 1526 |
+
index_obj = nn
|
| 1527 |
+
else:
|
| 1528 |
+
# If multi-part or non-LSA, do a minimal brute heuristic by re-vectorizing only for search when needed.
|
| 1529 |
+
# We'll defer exact vectors to search_fn’s projection.
|
| 1530 |
+
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 1531 |
+
# Fit on a tiny placeholder so object exists; we’ll guard in search.
|
| 1532 |
+
nn.fit(np.zeros((1, 4), dtype=np.float32))
|
| 1533 |
+
index_obj = nn
|
| 1534 |
+
except Exception:
|
| 1535 |
+
index_obj = None
|
| 1536 |
+
|
| 1537 |
# Summaries
|
| 1538 |
cluster_counts = (
|
| 1539 |
+
df[df["cluster_id"] >= 0]
|
| 1540 |
.groupby(["cluster_id", "cluster_name"]).size()
|
| 1541 |
.reset_index(name="count")
|
| 1542 |
.sort_values("count", ascending=False)
|
|
|
|
| 1546 |
# Append buckets
|
| 1547 |
news_count = int((df["cluster_id"] == -1).sum())
|
| 1548 |
alerts_count = int((df["cluster_id"] == -2).sum())
|
| 1549 |
+
hdbscan_noise = int((df["cluster_id"] < -2).sum())
|
| 1550 |
extra_rows = []
|
| 1551 |
if news_count > 0:
|
| 1552 |
extra_rows.append({"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count})
|
| 1553 |
if alerts_count > 0:
|
| 1554 |
extra_rows.append({"cluster_id": -2, "cluster_name": "system/alerts", "count": alerts_count})
|
| 1555 |
+
if hdbscan_noise > 0:
|
| 1556 |
+
extra_rows.append({"cluster_id": -3, "cluster_name": "HDBSCAN noise", "count": hdbscan_noise})
|
| 1557 |
if extra_rows:
|
| 1558 |
cluster_counts = pd.concat([cluster_counts, pd.DataFrame(extra_rows)], ignore_index=True)
|
| 1559 |
|
|
|
|
| 1578 |
)
|
| 1579 |
sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
|
| 1580 |
|
| 1581 |
+
|
| 1582 |
# Languages present
|
| 1583 |
langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
|
| 1584 |
lang_choices = ["(any)"] + langs
|
|
|
|
| 1600 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1601 |
out_table = show_df[cols_out].head(500)
|
| 1602 |
|
| 1603 |
+
# The vectorizer state is complex now with partitioning. For search, we need to rebuild it on the fly.
|
| 1604 |
+
# So we store the params to do so.
|
| 1605 |
+
vec_state = {
|
| 1606 |
+
"use_hashing": bool(use_hashing), "hash_bits": int(hash_bits),
|
| 1607 |
+
"max_features": int(max_features), "min_df": int(min_df), "max_df": float(max_df),
|
| 1608 |
+
"use_bigrams": bool(use_bigrams),
|
| 1609 |
+
# dummy objects for search fn compatibility
|
| 1610 |
+
"count_vec": None, "char_vec": None, "bm25": None
|
| 1611 |
+
}
|
| 1612 |
|
| 1613 |
status_md = (
|
| 1614 |
f"**Processed {len(df):,} emails** \n"
|
| 1615 |
+
f"Word feats: {d_word_agg:,} | Char feats: {d_char_agg:,} (x0.20) \n"
|
| 1616 |
+
f"{'LSA: ' + str(X_reduced_holder.shape[1]) + ' dims | ' if X_reduced_holder is not None else ''}"
|
| 1617 |
+
f"k = {k_agg} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1618 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
| 1619 |
)
|
| 1620 |
|
|
|
|
| 1624 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1625 |
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1626 |
lang_update = gr.update(choices=lang_choices, value="(any)")
|
| 1627 |
+
|
| 1628 |
+
# NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
|
| 1629 |
+
svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
|
| 1630 |
+
norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
|
| 1631 |
|
| 1632 |
return (
|
| 1633 |
status_md,
|
| 1634 |
+
cluster_counts, domain_counts, sender_counts,
|
| 1635 |
actors, offhours_table,
|
| 1636 |
out_table,
|
| 1637 |
+
df, vec_state, (X_reduced_holder if bool(use_lsa) else None), index_obj, term_names_global,
|
| 1638 |
+
bool(use_lsa), bool(use_faiss),
|
| 1639 |
cluster_update, domain_update, sender_update, lang_update,
|
| 1640 |
+
svd_obj_out, norm_obj_out,
|
| 1641 |
+
(d_word_agg, d_char_agg),
|
| 1642 |
extra_terms_lower, bool(highlight_toggle)
|
| 1643 |
)
|
| 1644 |
|
| 1645 |
(run_btn.click)(
|
| 1646 |
process_file,
|
| 1647 |
+
inputs=[
|
| 1648 |
+
inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1649 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1650 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1651 |
+
# NEW:
|
| 1652 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1653 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary
|
| 1654 |
+
],
|
| 1655 |
outputs=[status,
|
| 1656 |
+
cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 1657 |
actors_df, offhours_df,
|
| 1658 |
results_df,
|
| 1659 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
|
|
|
| 1684 |
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1685 |
tmp = tmp.drop(columns=["_dt"])
|
| 1686 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1687 |
+
if "search_score" in tmp.columns:
|
| 1688 |
+
cols_out.append("search_score")
|
| 1689 |
acc = [c for c in cols_out if c in tmp.columns]
|
| 1690 |
return tmp[acc].head(500)
|
| 1691 |
|
|
|
|
| 1731 |
except Exception:
|
| 1732 |
return None
|
| 1733 |
|
| 1734 |
+
def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
|
| 1735 |
+
# This has to re-fit a vectorizer to project the query, since partitioning means
|
| 1736 |
+
# we don't have a single global one.
|
| 1737 |
+
if vec_state.get("use_hashing"):
|
| 1738 |
+
hv = HashingVectorizer(
|
| 1739 |
+
analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
|
| 1740 |
+
n_features=2**vec_state.get('hash_bits', 18), alternate_sign=False,
|
| 1741 |
+
token_pattern=TOKEN_PATTERN, lowercase=True, norm=None
|
| 1742 |
+
)
|
| 1743 |
+
# fit to get the transformer state on the corpus
|
| 1744 |
+
word_counts = hv.fit_transform(corpus_texts_for_fit)
|
| 1745 |
+
tfidf_tr = TfidfTransformer().fit(word_counts)
|
| 1746 |
+
q_word_counts = hv.transform([q])
|
| 1747 |
+
q_word = tfidf_tr.transform(q_word_counts)
|
| 1748 |
+
else: # BM25 path
|
| 1749 |
+
count_vec = CountVectorizer(
|
| 1750 |
+
analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
|
| 1751 |
+
max_features=vec_state.get('max_features'), min_df=vec_state.get('min_df'),
|
| 1752 |
+
max_df=vec_state.get('max_df'), token_pattern=TOKEN_PATTERN, lowercase=True,
|
| 1753 |
+
dtype=np.float32, stop_words=STOPWORD_FOR_VEC
|
| 1754 |
+
)
|
| 1755 |
+
TF = count_vec.fit_transform(corpus_texts_for_fit)
|
| 1756 |
+
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 1757 |
+
q_word_tf = count_vec.transform([q])
|
| 1758 |
+
q_word = bm25.transform(q_word_tf)
|
| 1759 |
+
|
| 1760 |
+
char_vec = CharTfidf(
|
| 1761 |
+
analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
|
| 1762 |
+
lowercase=True, dtype=np.float32
|
| 1763 |
+
).fit(corpus_texts_for_fit)
|
| 1764 |
+
|
| 1765 |
+
q_char = char_vec.transform([q])
|
| 1766 |
+
q_full = hstack([q_word, q_char * 0.20], format="csr")
|
| 1767 |
return q_full
|
| 1768 |
|
| 1769 |
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1770 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1771 |
return pd.DataFrame(), []
|
| 1772 |
+
|
| 1773 |
+
# align with df_main order (exclude -1 and -2)
|
| 1774 |
+
mask = ~df["cluster_id"].isin([-1, -2, -3])
|
| 1775 |
+
filtered_df = df[mask].reset_index(drop=True)
|
| 1776 |
+
if filtered_df.empty:
|
| 1777 |
+
return pd.DataFrame(), []
|
| 1778 |
+
|
| 1779 |
q_terms = _tokenize_query(q)
|
| 1780 |
+
q_vec_full = _vectorize_query(q, vec_state, corpus_texts_for_fit=list(filtered_df.apply(enrich_text, axis=1)))
|
| 1781 |
+
|
| 1782 |
+
if use_lsa_flag and (X_reduced is not None) and (svd_obj is not None) and (norm_obj is not None):
|
| 1783 |
q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
|
| 1784 |
if q_emb is None:
|
| 1785 |
return pd.DataFrame(), q_terms
|
| 1786 |
else:
|
| 1787 |
q_emb = q_vec_full
|
| 1788 |
+
|
| 1789 |
+
n_neighbors = min(50, len(filtered_df))
|
| 1790 |
+
if n_neighbors <= 0: return pd.DataFrame(), q_terms
|
| 1791 |
+
|
|
|
|
| 1792 |
if isinstance(index_obj, NearestNeighbors):
|
| 1793 |
+
# Check if index was fit on placeholder
|
| 1794 |
+
if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
|
| 1795 |
+
return pd.DataFrame(), q_terms # cannot search
|
| 1796 |
+
distances, indices = index_obj.kneighbors(q_emb, n_neighbors=n_neighbors)
|
| 1797 |
inds = indices[0]
|
| 1798 |
sims = 1.0 - distances[0]
|
| 1799 |
results = filtered_df.iloc[inds].copy()
|
| 1800 |
results["search_score"] = sims
|
| 1801 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1802 |
+
D, I = index_obj.search(q_emb.astype(np.float32), k=n_neighbors)
|
| 1803 |
inds = I[0]
|
| 1804 |
sims = D[0]
|
| 1805 |
results = filtered_df.iloc[inds].copy()
|
|
|
|
| 1809 |
|
| 1810 |
# blend with corruption score lightly
|
| 1811 |
cs = results["corruption_score"].fillna(0.0)
|
| 1812 |
+
cs_norm = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9) if (cs.max() > cs.min()) else cs
|
| 1813 |
+
results["_blend"] = 0.7*results["search_score"].values + 0.3*cs_norm.values
|
| 1814 |
# sort UI-selected way
|
| 1815 |
if sort_by == "search_score":
|
| 1816 |
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
|
|
|
| 1821 |
results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
|
| 1822 |
results = results.drop(columns=["_blend"])
|
| 1823 |
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
|
| 1824 |
+
return _sort_results(results.head(50), sort_by, sort_dir), q_terms
|
| 1825 |
|
| 1826 |
search_btn.click(
|
| 1827 |
search_fn,
|
|
|
|
| 1849 |
if dstr is not None:
|
| 1850 |
cand = cand[cand["date"] == dstr]
|
| 1851 |
if len(cand) == 0:
|
| 1852 |
+
# Fallback to subject only if exact match fails
|
| 1853 |
cand = df[df["subject"] == sel.get("subject", "")]
|
| 1854 |
if len(cand) == 0:
|
| 1855 |
+
return f"Could not find original record for: {subj}"
|
| 1856 |
row = cand.iloc[0]
|
| 1857 |
+
cid = int(row.get("cluster_id", -99))
|
| 1858 |
+
clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
|
| 1859 |
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
|
| 1860 |
|
| 1861 |
results_df.select(
|
|
|
|
| 1874 |
return "(any)"
|
| 1875 |
label = df_sum.iloc[row_idx]["label"]
|
| 1876 |
return label if isinstance(label, str) else "(any)"
|
| 1877 |
+
|
| 1878 |
+
# Click domain summary to filter
def on_domain_click(evt: "gr.SelectData", df_sum: "pd.DataFrame"):
    """Resolve a click on the domain summary table into a dropdown value.

    Returns the clicked row's ``from_domain``, or the wildcard "(any)"
    whenever the selection cannot be resolved (no table, bad or stale
    index, empty value).
    """
    try:
        # gradio may report the selection as a scalar or a (row, col) pair.
        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        row_idx = evt.index if hasattr(evt, "index") else None
    if row_idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    try:
        # Guard against stale/out-of-range indices so the callback never
        # crashes the UI; fall back to the wildcard instead.
        val = df_sum.iloc[row_idx]["from_domain"]
    except Exception:
        return "(any)"
    return val if isinstance(val, str) and val else "(any)"
|
| 1888 |
|
| 1889 |
cluster_counts_df.select(
|
| 1890 |
on_cluster_click,
|
|
|
|
| 1895 |
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1896 |
outputs=[results_df]
|
| 1897 |
)
|
| 1898 |
+
|
| 1899 |
+
# Clicking a row in the domain summary sets the domain filter dropdown,
# then re-runs the filtered results query with the current filter state.
domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
    .then(
        refresh_results,
        inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
        outputs=[results_df]
    )
|
| 1905 |
+
|
| 1906 |
+
def on_sender_click(evt: "gr.SelectData", df_sum: "pd.DataFrame"):
    """Resolve a click on the sender summary table into a dropdown value.

    Returns the clicked row's ``from_email``, or the wildcard "(any)"
    whenever the selection cannot be resolved (no table, bad or stale
    index, empty value).
    """
    try:
        # gradio may report the selection as a scalar or a (row, col) pair.
        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        row_idx = evt.index if hasattr(evt, "index") else None
    if row_idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    try:
        # Guard against stale/out-of-range indices so the callback never
        # crashes the UI; fall back to the wildcard instead.
        val = df_sum.iloc[row_idx]["from_email"]
    except Exception:
        return "(any)"
    return val if isinstance(val, str) and val else "(any)"
|
| 1915 |
+
|
| 1916 |
+
# Clicking a row in the sender summary sets the sender filter dropdown,
# then re-runs the filtered results query with the current filter state.
sender_counts_df.select(on_sender_click, inputs=[sender_counts_df], outputs=[sender_drop]) \
    .then(
        refresh_results,
        inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
        outputs=[results_df]
    )
|
| 1922 |
+
|
| 1923 |
|
| 1924 |
# Script entry point: launch the Gradio app when run directly.
if __name__ == "__main__":
    demo.launch()
|