wuhp commited on
Commit
609d65e
·
verified ·
1 Parent(s): e12524d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +609 -222
app.py CHANGED
@@ -25,6 +25,22 @@ from sklearn.preprocessing import Normalizer
25
  from sklearn.preprocessing import normalize as sk_normalize
26
  from sklearn.metrics.pairwise import cosine_similarity
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Optional light anomaly detection
29
  try:
30
  from sklearn.ensemble import IsolationForest
@@ -571,6 +587,62 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
571
  )
572
  return html
573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  # =================== Feature engineering (BM25 + char) ===================
575
  class BM25Transformer:
576
  def __init__(self, k1=1.2, b=0.75):
@@ -615,96 +687,117 @@ def enrich_text(row: pd.Series) -> str:
615
  return (t + " " + " ".join(tokens)).strip()
616
 
617
  # =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
618
- def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75):
619
  """
620
- Create human-readable labels per cluster using:
621
- 1) PMI bigrams (cluster vs global) + subject coverage boost
622
- 2) Class-TFIDF unigrams (cluster vs rest) + subject coverage boost
623
-
624
- `subjects`: list of subject strings aligned with `texts`
625
- `subject_alpha`: weight added per token = alpha * coverage_in_subjects (0..1)
626
  """
627
  import math as _math
628
  from collections import Counter, defaultdict
629
  from sklearn.feature_extraction.text import TfidfVectorizer
630
 
631
- HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded","回复","主题","收件人","发件人"}
 
632
 
633
  def is_junk_token(tok: str) -> bool:
634
  if _is_junk_term(tok): return True
635
  tl = tok.lower()
636
- if tl.startswith("__"): return True # hide __LANG/__HAS in labels
637
- if tl in STOP_TERMS: return True # extra HTML/MIME junk
638
  if tl in HEADER_STOP: return True
639
  if "@" in tl: return True
640
- # drop short ASCII like "eb/ys/yl"
641
  if tl.isascii() and len(tl) <= 2: return True
642
- # punctuation blobs (keep apostrophes)
643
- if re.search(r"[^\w\-']", tl):
644
- if "’" not in tl and "'" not in tl:
645
- return True
646
  return False
647
 
648
  def tokenize_clean(t):
649
- toks = re.findall(TOKEN_PATTERN, t.lower())
650
  return [w for w in toks if not is_junk_token(w)]
651
 
652
- def bigrams(toks):
653
- return [" ".join(p) for p in zip(toks, toks[1:]) if all(not is_junk_token(x) for x in p)]
654
 
655
- glob_bg = Counter()
 
 
 
656
  per_c_bg = defaultdict(Counter)
 
657
  per_c_texts = defaultdict(list)
658
  per_c_doc_count = defaultdict(int)
659
-
660
- # SUBJECT presence (unique tokens/bigrams per subject per doc)
661
  per_c_subj_uni_docs = defaultdict(Counter)
662
  per_c_subj_bg_docs = defaultdict(Counter)
 
663
 
664
  have_subjects = subjects is not None and len(subjects) == len(texts)
665
 
 
666
  for idx, (txt, c) in enumerate(zip(texts, labels)):
667
  c = int(c)
668
  toks = tokenize_clean(txt)
669
- bgs = set(bigrams(toks))
670
- glob_bg.update(bgs)
671
- per_c_bg[c].update(bgs)
 
 
 
 
 
 
 
672
  per_c_texts[c].append(" ".join(toks))
673
  per_c_doc_count[c] += 1
674
-
675
  if have_subjects:
676
- subj_toks = tokenize_clean(subjects[idx] or "")
677
- subj_uni_set = set(subj_toks)
678
- subj_bg_set = set(bigrams(subj_toks))
679
- per_c_subj_uni_docs[c].update(subj_uni_set)
680
- per_c_subj_bg_docs[c].update(subj_bg_set)
681
-
 
 
 
682
  labels_out = {}
683
- total_bg = sum(glob_bg.values()) + 1e-12
 
 
 
684
 
685
  for c in sorted(set(int(x) for x in labels)):
686
  n_docs_c = max(1, per_c_doc_count[c])
687
 
688
- # PMI bigrams (+ subject boost)
689
- scores = []
690
- total_c = sum(per_c_bg[c].values()) + 1e-12
691
- for bg, cnt in per_c_bg[c].most_common(2000):
692
- p_bg_c = cnt / total_c
693
- p_bg = (glob_bg[bg] / total_bg)
694
- if p_bg > 0 and p_bg_c > 0:
695
- score = _math.log(p_bg_c) - _math.log(p_bg)
696
- # subject coverage boost: fraction of cluster docs whose SUBJECT contains this bigram
697
- cov = 0.0
698
- if have_subjects:
699
- cov = per_c_subj_bg_docs[c][bg] / n_docs_c
700
- score = score + subject_alpha * cov
701
- scores.append((score, bg))
702
- scores.sort(reverse=True)
703
- top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
704
-
705
- # class-TFIDF unigrams (cluster doc vs. background doc) + subject boost
 
 
 
 
 
 
 
 
 
706
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
707
- docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
708
  corpus = [docs_c[0], docs_bg[0]]
709
  vec = TfidfVectorizer(
710
  analyzer="word", ngram_range=(1,1),
@@ -714,30 +807,27 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
714
  vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
715
  row = X[0].toarray().ravel()
716
 
717
- # Build subject coverage vector over this vocab
718
  subj_cov = np.zeros_like(row)
719
  if have_subjects:
720
  vocab_index = {t:i for i,t in enumerate(vocab)}
721
  for tok, cnt_docs in per_c_subj_uni_docs[c].items():
722
- if tok in vocab_index:
723
- subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c # 0..1
724
-
725
- # Apply boost (only to non-junk tokens)
726
- row_boosted = row.copy()
727
- for i, tok in enumerate(vocab):
728
- if subj_cov[i] > 0 and not is_junk_token(tok):
729
- row_boosted[i] = row[i] + subject_alpha * float(subj_cov[i])
730
-
731
- top_idx = row_boosted.argsort()[::-1][: max(0, topn - len(top_bi)) ]
732
- top_uni = []
733
- for i in top_idx:
734
  tok = vocab[i]
735
- if not is_junk_token(tok):
736
- top_uni.append(tok)
737
- if len(top_uni) >= max(0, topn - len(top_bi)):
 
 
738
  break
739
 
740
- parts = top_bi + top_uni
741
  labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
742
 
743
  return labels_out
@@ -898,12 +988,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
898
  max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
899
  use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
900
  skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
 
 
 
901
  with gr.Row():
902
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
903
  lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
904
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
905
  k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
906
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
 
 
 
 
 
 
907
  with gr.Row():
908
  use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
909
  use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
@@ -913,6 +1012,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
913
  trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
914
  extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
915
  highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
 
 
 
 
 
 
916
  with gr.Row():
917
  cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
918
  domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
@@ -934,6 +1039,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
934
  with gr.Row():
935
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
936
  domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
 
 
 
937
 
938
  with gr.Row():
939
  actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
@@ -1003,7 +1111,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1003
  ) -> pd.DataFrame:
1004
  out = df
1005
  if cluster and cluster != "(any)":
1006
- m = re.match(r"^(\d+)\s+—", cluster)
1007
  if m:
1008
  cid = int(m.group(1))
1009
  out = out[out["cluster_id"] == cid]
@@ -1054,13 +1162,182 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1054
  # -------- Main pipeline --------
1055
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1056
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1057
- trusted_domains_in, extra_keywords_in, highlight_toggle):
 
 
 
1058
  if inbox_file is None:
1059
  return ("**Please upload a file.**",
1060
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1061
  None, None, None, None)
1062
 
1063
- use_lang = not bool(skip_lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1064
 
1065
  # trusted org domains
1066
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
@@ -1071,19 +1348,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1071
  recs = _load_json_records(inbox_file.name)
1072
  if not recs:
1073
  return ("**No valid records found.**",
1074
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1075
  None, None, None, None)
1076
 
1077
  # Normalize
1078
  normd = []
1079
  for r in tqdm(recs, desc="Normalize", leave=False):
1080
- out = normalize_email_record(r, use_langdetect=use_lang)
1081
  if out and out.get("body_text") is not None:
1082
  normd.append(out)
1083
  df = pd.DataFrame(normd)
1084
  if df.empty:
1085
  return ("**No usable email records after normalization.**",
1086
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1087
  None, None, None, None)
1088
 
1089
  # Deduplicate conservatively
@@ -1116,135 +1393,150 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1116
  df_news = df[df["is_news"]].reset_index(drop=True)
1117
  df_alerts = df[df["is_notify"]].reset_index(drop=True)
1118
 
1119
- # Enriched texts (adds __HAS_*__ flags + __LANG__)
1120
- texts = list(df_main.apply(enrich_text, axis=1))
1121
- subjects_only = list(df_main["subject"].fillna(""))
1122
-
1123
- # === Vectorization ===
1124
- ngram_range = (1, 2) if use_bigrams else (1, 1)
1125
- count_vec = CountVectorizer(
1126
- analyzer="word",
1127
- ngram_range=ngram_range,
1128
- max_features=int(max_features) if max_features else None,
1129
- min_df=int(min_df) if min_df else 2,
1130
- max_df=float(max_df) if max_df else 0.7,
1131
- token_pattern=TOKEN_PATTERN,
1132
- lowercase=True,
1133
- dtype=np.float32,
1134
- stop_words=STOPWORD_FOR_VEC, # list (not set)
1135
- )
1136
- TF = count_vec.fit_transform(texts)
1137
- bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
1138
- X_word = bm25.transform(TF) # sparse BM25 word matrix
1139
-
1140
- char_vec = CharTfidf(
1141
- analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
1142
- lowercase=True, dtype=np.float32
1143
- )
1144
- X_char = char_vec.fit_transform(texts)
1145
-
1146
- # Down-weight char-grams so they don't dominate geometry
1147
- X_full = hstack([X_word, X_char * 0.20], format="csr")
1148
- d_word = X_word.shape[1]
1149
- d_char = X_char.shape[1]
1150
- d_full = X_full.shape[1]
1151
-
1152
- # LSA
1153
- use_lsa = bool(use_lsa)
1154
- X_reduced = None
1155
- svd_obj = None
1156
- norm_obj = None
1157
- if use_lsa:
1158
- svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
1159
- X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
1160
- norm_obj = Normalizer(copy=False)
1161
- X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
1162
- del X_reduced_tmp
1163
- gc.collect()
1164
-
1165
- # Optional anomaly detection (on LSA space)
1166
- anomaly_scores = np.full((len(df_main),), np.nan, dtype=np.float32)
1167
- if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
1168
- try:
1169
- iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
1170
- iso.fit(X_reduced)
1171
- anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32) # higher = more anomalous
1172
- except Exception:
1173
- pass
1174
 
1175
- # K selection
1176
- if bool(auto_k):
1177
- if use_lsa:
1178
- k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
1179
- else:
1180
- k = auto_k_rule(X_full.shape[0])
 
 
 
 
 
 
 
 
1181
  else:
1182
- k = max(10, int(k_clusters or 350))
1183
-
1184
- # Optional seeded init (only in LSA space)
1185
- init = None
1186
- if use_lsa:
1187
- seeds = seeded_centroids_in_lsa(
1188
- CORR_LEX, count_vec, svd_obj.components_, norm_obj,
1189
- d_word=d_word, d_full=d_full, k=k
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1190
  )
1191
- if seeds is not None and seeds.shape[0] == k:
1192
- init = seeds
1193
-
1194
- # KMeans clustering (use LSA space if enabled)
1195
- X_space = (X_reduced if use_lsa else X_full)
1196
- kmeans = MiniBatchKMeans(
1197
- n_clusters=k,
1198
- batch_size=int(mb_batch or 4096),
1199
- random_state=0,
1200
- n_init="auto" if init is None else 1,
1201
- init="k-means++" if init is None else init
1202
- )
1203
- labels = kmeans.fit_predict(X_space)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1204
 
1205
- # Merge very-similar clusters (LSA only) stricter
1206
- if use_lsa:
1207
- labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.95)
 
 
 
1208
 
1209
- # Attach clustering back to df_main
1210
- df_main["cluster_id"] = labels
1211
- term_names = cluster_labels_pmi_bigram(texts, labels, subjects=subjects_only, topn=6, subject_alpha=0.75)
1212
- df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
1213
- df_main["anomaly_score"] = anomaly_scores
1214
 
1215
- # Newsletter/newswire rows: assign special cluster
1216
- if len(df_news):
1217
- df_news["cluster_id"] = -1
1218
- df_news["cluster_name"] = "newsletter/news"
1219
- df_news["anomaly_score"] = np.nan
1220
 
1221
- # System/notification rows: assign special cluster
 
 
 
 
 
 
 
 
 
 
 
 
 
1222
  if len(df_alerts):
1223
- df_alerts["cluster_id"] = -2
1224
- df_alerts["cluster_name"] = "system/alerts"
1225
- df_alerts["anomaly_score"] = np.nan
1226
 
1227
  # Combine back
1228
  df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
1229
 
1230
- # CorruptionScore (uses trusted domains)
1231
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1232
 
1233
- # Build search index on df_main only
1234
- use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
1235
  index_obj = None
1236
  if use_faiss:
1237
- d = X_reduced.shape[1]
1238
- index_obj = faiss.IndexFlatIP(d) # cosine ~ inner product on normalized vectors
1239
- index_obj.add(X_reduced)
1240
  else:
1241
- nn = NearestNeighbors(metric="cosine", algorithm="brute")
1242
- nn.fit(X_space)
1243
- index_obj = nn
1244
-
 
 
 
 
 
 
 
 
 
 
 
 
 
1245
  # Summaries
1246
  cluster_counts = (
1247
- df[(df["cluster_id"] != -1) & (df["cluster_id"] != -2)]
1248
  .groupby(["cluster_id", "cluster_name"]).size()
1249
  .reset_index(name="count")
1250
  .sort_values("count", ascending=False)
@@ -1254,11 +1546,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1254
  # Append buckets
1255
  news_count = int((df["cluster_id"] == -1).sum())
1256
  alerts_count = int((df["cluster_id"] == -2).sum())
 
1257
  extra_rows = []
1258
  if news_count > 0:
1259
  extra_rows.append({"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count})
1260
  if alerts_count > 0:
1261
  extra_rows.append({"cluster_id": -2, "cluster_name": "system/alerts", "count": alerts_count})
 
 
1262
  if extra_rows:
1263
  cluster_counts = pd.concat([cluster_counts, pd.DataFrame(extra_rows)], ignore_index=True)
1264
 
@@ -1283,6 +1578,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1283
  )
1284
  sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
1285
 
 
1286
  # Languages present
1287
  langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
1288
  lang_choices = ["(any)"] + langs
@@ -1304,13 +1600,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1304
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
1305
  out_table = show_df[cols_out].head(500)
1306
 
1307
- vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
 
 
 
 
 
 
 
 
1308
 
1309
  status_md = (
1310
  f"**Processed {len(df):,} emails** \n"
1311
- f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.20) | Total: {d_full:,} \n"
1312
- f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
1313
- f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1314
  f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
1315
  )
1316
 
@@ -1320,27 +1624,36 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1320
  domain_update = gr.update(choices=domain_choices, value="(any)")
1321
  sender_update = gr.update(choices=sender_choices, value="(any)")
1322
  lang_update = gr.update(choices=lang_choices, value="(any)")
 
 
 
 
1323
 
1324
  return (
1325
  status_md,
1326
- cluster_counts, domain_counts,
1327
  actors, offhours_table,
1328
  out_table,
1329
- df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
1330
- use_lsa, bool(use_faiss),
1331
  cluster_update, domain_update, sender_update, lang_update,
1332
- svd_obj, norm_obj,
1333
- (d_word, d_char),
1334
  extra_terms_lower, bool(highlight_toggle)
1335
  )
1336
 
1337
  (run_btn.click)(
1338
  process_file,
1339
- inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1340
- use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1341
- trusted_domains_in, extra_keywords_in, highlight_toggle],
 
 
 
 
 
1342
  outputs=[status,
1343
- cluster_counts_df, domain_counts_df,
1344
  actors_df, offhours_df,
1345
  results_df,
1346
  state_df, state_vec, state_X_reduced, state_index, state_term_names,
@@ -1371,6 +1684,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1371
  tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
1372
  tmp = tmp.drop(columns=["_dt"])
1373
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
 
 
1374
  acc = [c for c in cols_out if c in tmp.columns]
1375
  return tmp[acc].head(500)
1376
 
@@ -1416,40 +1731,75 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1416
  except Exception:
1417
  return None
1418
 
1419
- def _vectorize_query(q: str, vec_state: Dict[str, Any]):
1420
- count_vec = vec_state["count_vec"]
1421
- char_vec = vec_state["char_vec"]
1422
- bm25 = vec_state["bm25"]
1423
- q_word_tf = count_vec.transform([q])
1424
- q_word = bm25.transform(q_word_tf)
1425
- q_char = char_vec.transform([q])
1426
- q_full = hstack([q_word, q_char], format="csr")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1427
  return q_full
1428
 
1429
  def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
1430
  if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
1431
  return pd.DataFrame(), []
 
 
 
 
 
 
 
1432
  q_terms = _tokenize_query(q)
1433
- q_vec_full = _vectorize_query(q, vec_state)
1434
- if use_lsa_flag and (X_reduced is not None):
 
1435
  q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
1436
  if q_emb is None:
1437
  return pd.DataFrame(), q_terms
1438
  else:
1439
  q_emb = q_vec_full
1440
-
1441
- # align with df_main order (exclude -1 and -2)
1442
- mask = ~df["cluster_id"].isin([-1, -2])
1443
- filtered_df = df[mask]
1444
-
1445
  if isinstance(index_obj, NearestNeighbors):
1446
- distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(filtered_df)))
 
 
 
1447
  inds = indices[0]
1448
  sims = 1.0 - distances[0]
1449
  results = filtered_df.iloc[inds].copy()
1450
  results["search_score"] = sims
1451
  elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
1452
- D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(filtered_df)))
1453
  inds = I[0]
1454
  sims = D[0]
1455
  results = filtered_df.iloc[inds].copy()
@@ -1459,8 +1809,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1459
 
1460
  # blend with corruption score lightly
1461
  cs = results["corruption_score"].fillna(0.0)
1462
- cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
1463
- results["_blend"] = 0.7*results["search_score"].values + 0.3*cs.values
1464
  # sort UI-selected way
1465
  if sort_by == "search_score":
1466
  results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
@@ -1471,7 +1821,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1471
  results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
1472
  results = results.drop(columns=["_blend"])
1473
  cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
1474
- return results[cols].head(50), q_terms
1475
 
1476
  search_btn.click(
1477
  search_fn,
@@ -1499,12 +1849,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1499
  if dstr is not None:
1500
  cand = cand[cand["date"] == dstr]
1501
  if len(cand) == 0:
 
1502
  cand = df[df["subject"] == sel.get("subject", "")]
1503
  if len(cand) == 0:
1504
- return ""
1505
  row = cand.iloc[0]
1506
- cid = int(row.get("cluster_id", -1))
1507
- clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
1508
  return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
1509
 
1510
  results_df.select(
@@ -1523,6 +1874,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1523
  return "(any)"
1524
  label = df_sum.iloc[row_idx]["label"]
1525
  return label if isinstance(label, str) else "(any)"
 
 
 
 
 
 
 
 
 
 
 
1526
 
1527
  cluster_counts_df.select(
1528
  on_cluster_click,
@@ -1533,6 +1895,31 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1533
  inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
1534
  outputs=[results_df]
1535
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1536
 
1537
  if __name__ == "__main__":
1538
- demo.launch()
 
25
  from sklearn.preprocessing import normalize as sk_normalize
26
  from sklearn.metrics.pairwise import cosine_similarity
27
 
28
+ # === NEW / UPDATED IMPORTS ===
29
+ from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer # NEW
30
+ from scipy.sparse import csr_matrix # NEW (to mix dense embeddings with sparse)
31
+ try:
32
+ import hdbscan # OPTIONAL (pip install hdbscan)
33
+ HDBSCAN_OK = True
34
+ except Exception:
35
+ HDBSCAN_OK = False
36
+
37
+ # Optional tiny/fast word vectors via Gensim (local .txt/.vec/.bin)
38
+ try:
39
+ from gensim.models import KeyedVectors # OPTIONAL
40
+ GENSIM_OK = True
41
+ except Exception:
42
+ GENSIM_OK = False
43
+
44
  # Optional light anomaly detection
45
  try:
46
  from sklearn.ensemble import IsolationForest
 
587
  )
588
  return html
589
 
590
+ # ---------- Lightweight Embedding Utilities (Optional) ----------
591
+ def _load_embeddings(emb_path: str, binary: bool):
592
+ """
593
+ Load word vectors with Gensim if available.
594
+ Accepts word2vec binary (.bin) or text formats (.txt/.vec).
595
+ Returns (model, dim) or (None, 0) if not available.
596
+ """
597
+ if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
598
+ return None, 0
599
+ try:
600
+ if binary:
601
+ kv = KeyedVectors.load_word2vec_format(emb_path, binary=True)
602
+ else:
603
+ kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
604
+ return kv, int(kv.vector_size)
605
+ except Exception:
606
+ # Attempt GloVe-like with headerless text
607
+ try:
608
+ kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
609
+ return kv, int(kv.vector_size)
610
+ except Exception:
611
+ return None, 0
612
+
613
+ def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
614
+ """
615
+ Average embeddings over tokens matched by TOKEN_PATTERN.
616
+ Returns zero vector if nothing matches or kv is None.
617
+ """
618
+ vec = np.zeros((dim,), dtype=np.float32)
619
+ if not kv or not text:
620
+ return vec
621
+ toks = re.findall(TOKEN_PATTERN, text.lower())
622
+ cnt = 0
623
+ for t in toks:
624
+ if t in kv:
625
+ vec += kv[t]
626
+ cnt += 1
627
+ if cnt > 0:
628
+ vec /= float(cnt)
629
+ # L2-normalize
630
+ n = np.linalg.norm(vec)
631
+ if n > 0:
632
+ vec /= n
633
+ return vec
634
+
635
+ def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
636
+ """
637
+ Build [n_docs, dim] dense matrix of averaged embeddings.
638
+ """
639
+ if not kv or dim <= 0:
640
+ return np.zeros((len(texts), 0), dtype=np.float32)
641
+ out = np.zeros((len(texts), dim), dtype=np.float32)
642
+ for i, t in enumerate(texts):
643
+ out[i, :] = _avg_embed_for_text(t or "", kv, dim)
644
+ return out
645
+
646
  # =================== Feature engineering (BM25 + char) ===================
647
  class BM25Transformer:
648
  def __init__(self, k1=1.2, b=0.75):
 
687
  return (t + " " + " ".join(tokens)).strip()
688
 
689
  # =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
690
+ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20):
691
  """
692
+ Improved labeler:
693
+ - Considers bigrams AND trigrams (PMI vs. global)
694
+ - Class-TFIDF unigrams with subject coverage boost
695
+ - Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
 
 
696
  """
697
  import math as _math
698
  from collections import Counter, defaultdict
699
  from sklearn.feature_extraction.text import TfidfVectorizer
700
 
701
+ HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded",
702
+ "回复","主题","收件人","发件人"}
703
 
704
  def is_junk_token(tok: str) -> bool:
705
  if _is_junk_term(tok): return True
706
  tl = tok.lower()
707
+ if tl.startswith("__"): return True
708
+ if tl in STOP_TERMS: return True
709
  if tl in HEADER_STOP: return True
710
  if "@" in tl: return True
 
711
  if tl.isascii() and len(tl) <= 2: return True
712
+ if re.search(r"[^\w\-']", tl) and "’" not in tl and "'" not in tl: return True
 
 
 
713
  return False
714
 
715
  def tokenize_clean(t):
716
+ toks = re.findall(TOKEN_PATTERN, (t or "").lower())
717
  return [w for w in toks if not is_junk_token(w)]
718
 
719
+ def ngrams(toks, n):
720
+ return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
721
 
722
+ # Compute global doc frequency for tokens, bigrams, trigrams
723
+ glob_df_uni = Counter()
724
+ glob_df_bg = Counter()
725
+ glob_df_tri = Counter()
726
  per_c_bg = defaultdict(Counter)
727
+ per_c_tri = defaultdict(Counter)
728
  per_c_texts = defaultdict(list)
729
  per_c_doc_count = defaultdict(int)
 
 
730
  per_c_subj_uni_docs = defaultdict(Counter)
731
  per_c_subj_bg_docs = defaultdict(Counter)
732
+ per_c_subj_tri_docs = defaultdict(Counter)
733
 
734
  have_subjects = subjects is not None and len(subjects) == len(texts)
735
 
736
+ # Pre-pass: DF stats
737
  for idx, (txt, c) in enumerate(zip(texts, labels)):
738
  c = int(c)
739
  toks = tokenize_clean(txt)
740
+ uni_set = set(toks)
741
+ bg_set = set(ngrams(toks, 2))
742
+ tri_set = set(ngrams(toks, 3))
743
+ # DF
744
+ glob_df_uni.update(uni_set)
745
+ glob_df_bg.update(bg_set)
746
+ glob_df_tri.update(tri_set)
747
+ # Per-cluster counts
748
+ per_c_bg[c].update(bg_set)
749
+ per_c_tri[c].update(tri_set)
750
  per_c_texts[c].append(" ".join(toks))
751
  per_c_doc_count[c] += 1
752
+ # Subject presence
753
  if have_subjects:
754
+ stoks = tokenize_clean(subjects[idx] or "")
755
+ s_uni = set(stoks)
756
+ s_bg = set(ngrams(stoks, 2))
757
+ s_tri = set(ngrams(stoks, 3))
758
+ per_c_subj_uni_docs[c].update(s_uni)
759
+ per_c_subj_bg_docs[c].update(s_bg)
760
+ per_c_subj_tri_docs[c].update(s_tri)
761
+
762
+ N = max(1, len(texts))
763
  labels_out = {}
764
+
765
+ # Helper: ubiquity filter
766
+ def too_ubiquitous(df_count): # fraction of docs
767
+ return (df_count / float(N)) > float(global_ubiq_cut)
768
 
769
  for c in sorted(set(int(x) for x in labels)):
770
  n_docs_c = max(1, per_c_doc_count[c])
771
 
772
+ # PMI bigrams & trigrams with subject-coverage boost
773
+ phrases = []
774
+ for store, glob_df, subj_docs, n in (
775
+ (per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
776
+ (per_c_tri[c], glob_df_tri, per_c_subj_tri_docs[c], 3),
777
+ ):
778
+ total_c = sum(store.values()) + 1e-12
779
+ total_g = sum(glob_df.values()) + 1e-12
780
+ scored = []
781
+ for ng, cnt in store.most_common(3000):
782
+ if too_ubiquitous(glob_df[ng]): # skip ubiquitous n-grams
783
+ continue
784
+ p_ng_c = cnt / total_c
785
+ p_ng_g = (glob_df[ng] / total_g)
786
+ if p_ng_c > 0 and p_ng_g > 0:
787
+ score = _math.log(p_ng_c) - _math.log(p_ng_g)
788
+ cov = 0.0
789
+ if have_subjects:
790
+ cov = subj_docs[ng] / n_docs_c
791
+ score += subject_alpha * cov
792
+ scored.append((score, ng))
793
+ scored.sort(reverse=True)
794
+ # take a couple from each class to avoid only-bigrams/only-trigrams
795
+ take = max(1, topn // (3 if n == 3 else 2))
796
+ phrases.extend([p for _, p in scored[:take]])
797
+
798
+ # Class-TFIDF unigrams with subject coverage boost
799
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
800
+ docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
801
  corpus = [docs_c[0], docs_bg[0]]
802
  vec = TfidfVectorizer(
803
  analyzer="word", ngram_range=(1,1),
 
807
  vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
808
  row = X[0].toarray().ravel()
809
 
810
+ # Subject coverage vector
811
  subj_cov = np.zeros_like(row)
812
  if have_subjects:
813
  vocab_index = {t:i for i,t in enumerate(vocab)}
814
  for tok, cnt_docs in per_c_subj_uni_docs[c].items():
815
+ if tok in vocab_index and not is_junk_token(tok):
816
+ subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
817
+
818
+ row_boosted = row + subject_alpha * subj_cov
819
+ order = row_boosted.argsort()[::-1]
820
+ unis = []
821
+ for i in order:
 
 
 
 
 
822
  tok = vocab[i]
823
+ if is_junk_token(tok): continue
824
+ if too_ubiquitous(glob_df_uni.get(tok, 0)): # suppress ubiquitous tokens
825
+ continue
826
+ unis.append(tok)
827
+ if len(unis) >= max(0, topn - len(phrases)):
828
  break
829
 
830
+ parts = (phrases + unis)[:max(2, topn)]
831
  labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
832
 
833
  return labels_out
 
988
  max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
989
  use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
990
  skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
991
+ with gr.Row():
992
+ use_hashing = gr.Checkbox(label="Use HashingVectorizer (memory-light, fast)", value=True) # NEW
993
+ hash_bits = gr.Slider(label="Hashing bits (2^n features)", minimum=16, maximum=20, step=1, value=18) # NEW
994
  with gr.Row():
995
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
996
  lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
997
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
998
  k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
999
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
1000
+ with gr.Row():
1001
+ use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False) # NEW
1002
+ hdb_min_cluster = gr.Number(label="HDBSCAN min_cluster_size", value=60, precision=0) # NEW
1003
+ hdb_min_samples = gr.Number(label="HDBSCAN min_samples (0=auto)", value=0, precision=0) # NEW
1004
+ with gr.Row():
1005
+ per_language = gr.Checkbox(label="Cluster per language (reduces cross-language mixing)", value=True) # NEW
1006
  with gr.Row():
1007
  use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
1008
  use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
 
1012
  trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
1013
  extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
1014
  highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
1015
+ with gr.Row():
1016
+ use_embeddings = gr.Checkbox(label="Add lightweight word embeddings (avg word2vec/GloVe) if available", value=False) # NEW
1017
+ embed_weight = gr.Slider(label="Embedding weight in feature space", minimum=0.0, maximum=1.0, step=0.05, value=0.35) # NEW
1018
+ with gr.Row():
1019
+ embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="") # NEW
1020
+ embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False) # NEW
1021
  with gr.Row():
1022
  cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
1023
  domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
 
1039
  with gr.Row():
1040
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
1041
  domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
1042
+
1043
+ with gr.Row():
1044
+ sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
1045
 
1046
  with gr.Row():
1047
  actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
 
1111
  ) -> pd.DataFrame:
1112
  out = df
1113
  if cluster and cluster != "(any)":
1114
+ m = re.match(r"^(\-?\d+)\s+—", cluster)
1115
  if m:
1116
  cid = int(m.group(1))
1117
  out = out[out["cluster_id"] == cid]
 
1162
  # -------- Main pipeline --------
1163
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1164
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1165
+ trusted_domains_in, extra_keywords_in, highlight_toggle,
1166
+ # NEW:
1167
+ use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1168
+ per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary):
1169
  if inbox_file is None:
1170
  return ("**Please upload a file.**",
1171
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1172
  None, None, None, None)
1173
 
1174
+ # === Vectorization & Clustering (UPGRADED) ===
1175
+ def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
1176
+ # “texts” feed vectorizers; “subjects_only” helps labels
1177
+ texts = list(df_in.apply(enrich_text, axis=1))
1178
+ subjects_only = list(df_in["subject"].fillna(""))
1179
+ return texts, subjects_only
1180
+
1181
+ def _vectorize_block(
1182
+ texts: List[str],
1183
+ use_bigrams: bool,
1184
+ max_features: int,
1185
+ min_df: int,
1186
+ max_df: float,
1187
+ use_hashing: bool,
1188
+ hash_bits: int
1189
+ ):
1190
+ """
1191
+ Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
1192
+ Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
1193
+ """
1194
+ ngram_range = (1, 2) if use_bigrams else (1, 1)
1195
+
1196
+ if use_hashing:
1197
+ # HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
1198
+ # We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
1199
+ hv = HashingVectorizer(
1200
+ analyzer="word",
1201
+ ngram_range=ngram_range,
1202
+ n_features=2 ** int(hash_bits),
1203
+ alternate_sign=False, # keep positivity
1204
+ token_pattern=TOKEN_PATTERN,
1205
+ lowercase=True,
1206
+ norm=None # raw counts first
1207
+ )
1208
+ word_counts = hv.transform(texts) # CSR
1209
+ # IDF (approximate TF-IDF since we lack exact DF per token str; good enough in practice here)
1210
+ tfidf_tr = TfidfTransformer()
1211
+ X_word = tfidf_tr.fit_transform(word_counts).astype(np.float32)
1212
+
1213
+ char_vec = CharTfidf(
1214
+ analyzer="char", ngram_range=(3, 5),
1215
+ min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
1216
+ )
1217
+ X_char = char_vec.fit_transform(texts)
1218
+ X_full = hstack([X_word, X_char * 0.20], format="csr")
1219
+ d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
1220
+ # For downstream code compatibility, expose “count_vec” and “bm25” placeholders
1221
+ count_vec = None
1222
+ bm25 = None
1223
+ return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
1224
+
1225
+ # Original Count -> BM25 + char
1226
+ count_vec = CountVectorizer(
1227
+ analyzer="word", ngram_range=ngram_range,
1228
+ max_features=int(max_features) if max_features else None,
1229
+ min_df=int(min_df) if min_df else 2,
1230
+ max_df=float(max_df) if max_df else 0.7,
1231
+ token_pattern=TOKEN_PATTERN, lowercase=True,
1232
+ dtype=np.float32, stop_words=STOPWORD_FOR_VEC
1233
+ )
1234
+ TF = count_vec.fit_transform(texts)
1235
+ bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
1236
+ X_word = bm25.transform(TF)
1237
+
1238
+ char_vec = CharTfidf(
1239
+ analyzer="char", ngram_range=(3, 5),
1240
+ min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
1241
+ )
1242
+ X_char = char_vec.fit_transform(texts)
1243
+ X_full = hstack([X_word, X_char * 0.20], format="csr")
1244
+ d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
1245
+ return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
1246
+
1247
+ def _reduce_space(X_full, use_lsa, lsa_dim):
1248
+ svd_obj = None
1249
+ norm_obj = None
1250
+ X_reduced = None
1251
+ if use_lsa:
1252
+ svd_obj = TruncatedSVD(n_components=int(lsa_dim or 256), random_state=0)
1253
+ Xtmp = svd_obj.fit_transform(X_full) # dense
1254
+ norm_obj = Normalizer(copy=False)
1255
+ X_reduced = norm_obj.fit_transform(Xtmp).astype(np.float32)
1256
+ del Xtmp; gc.collect()
1257
+ return X_reduced, svd_obj, norm_obj
1258
+
1259
+ def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
1260
+ """
1261
+ Concatenate averaged word vectors (dense) into the current space.
1262
+ We convert dense embeddings to CSR to safely hstack with CSR (if using non-LSA path).
1263
+ """
1264
+ if kv is None or emb_dim <= 0 or weight <= 0.0:
1265
+ return X_reduced_or_full, emb_dim
1266
+ doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
1267
+ if weight != 1.0:
1268
+ doc_embs *= float(weight)
1269
+ if isinstance(X_reduced_or_full, np.ndarray):
1270
+ # LSA path (dense): concat dense to dense
1271
+ return np.hstack([X_reduced_or_full, doc_embs]).astype(np.float32), emb_dim
1272
+ else:
1273
+ # Sparse path: convert embeddings to CSR and hstack
1274
+ X_emb = csr_matrix(doc_embs)
1275
+ return hstack([X_reduced_or_full, X_emb], format="csr"), emb_dim
1276
+
1277
+ def _cluster_space(
1278
+ X_space,
1279
+ df_part: pd.DataFrame,
1280
+ use_lsa: bool,
1281
+ use_hdbscan: bool,
1282
+ hdb_min_cluster: int,
1283
+ hdb_min_samples: int,
1284
+ auto_k: bool,
1285
+ k_clusters: int,
1286
+ mb_batch: int,
1287
+ count_vec,
1288
+ svd_obj,
1289
+ norm_obj,
1290
+ d_word, d_char
1291
+ ):
1292
+ """
1293
+ Run HDBSCAN (if requested and available) or MiniBatchKMeans.
1294
+ Return: labels (np.array), centers (np.ndarray) or None if HDBSCAN, and chosen_k.
1295
+ """
1296
+ if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
1297
+ # HDBSCAN on LSA (dense) only
1298
+ min_samples = None if int(hdb_min_samples or 0) <= 0 else int(hdb_min_samples)
1299
+ clusterer = hdbscan.HDBSCAN(
1300
+ min_cluster_size=int(hdb_min_cluster or 60),
1301
+ min_samples=min_samples,
1302
+ metric='euclidean', # cosine≈euclidean on L2-normalized vectors
1303
+ cluster_selection_epsilon=0.0,
1304
+ core_dist_n_jobs=1 # CPU-friendly on HF
1305
+ )
1306
+ labels = clusterer.fit_predict(X_space)
1307
+ centers = None
1308
+ chosen_k = int(len(set([l for l in labels if l >= 0])))
1309
+ return labels, centers, chosen_k
1310
+
1311
+ # Otherwise MiniBatchKMeans (supports dense or sparse)
1312
+ if bool(auto_k):
1313
+ if use_lsa and isinstance(X_space, np.ndarray):
1314
+ k, _ = choose_k_by_kneedle(X_space, ks=(50,100,150,200,300,400,500))
1315
+ else:
1316
+ k = auto_k_rule(X_space.shape[0])
1317
+ else:
1318
+ k = max(10, int(k_clusters or 350))
1319
+
1320
+ init = None
1321
+ if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
1322
+ seeds = seeded_centroids_in_lsa(
1323
+ CORR_LEX, count_vec, svd_obj.components_, norm_obj,
1324
+ d_word=d_word, d_full=(d_word + d_char), k=k
1325
+ )
1326
+ if seeds is not None and seeds.shape[0] == k:
1327
+ init = seeds
1328
+
1329
+ kmeans = MiniBatchKMeans(
1330
+ n_clusters=k, batch_size=int(mb_batch or 4096),
1331
+ random_state=0, n_init="auto" if init is None else 1,
1332
+ init="k-means++" if init is None else init
1333
+ )
1334
+ labels = kmeans.fit_predict(X_space)
1335
+ centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
1336
+ if use_lsa and centers is not None:
1337
+ labels = merge_close_clusters(labels, centers, thresh=0.95)
1338
+ chosen_k = int(len(set(labels)))
1339
+ return labels, centers, chosen_k
1340
+
1341
 
1342
  # trusted org domains
1343
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
 
1348
  recs = _load_json_records(inbox_file.name)
1349
  if not recs:
1350
  return ("**No valid records found.**",
1351
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1352
  None, None, None, None)
1353
 
1354
  # Normalize
1355
  normd = []
1356
  for r in tqdm(recs, desc="Normalize", leave=False):
1357
+ out = normalize_email_record(r, use_langdetect=(not bool(skip_lang)))
1358
  if out and out.get("body_text") is not None:
1359
  normd.append(out)
1360
  df = pd.DataFrame(normd)
1361
  if df.empty:
1362
  return ("**No usable email records after normalization.**",
1363
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1364
  None, None, None, None)
1365
 
1366
  # Deduplicate conservatively
 
1393
  df_news = df[df["is_news"]].reset_index(drop=True)
1394
  df_alerts = df[df["is_notify"]].reset_index(drop=True)
1395
 
1396
+ # ----- Build texts (and optionally partition by language) -----
1397
+ use_lang = not bool(skip_lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1398
 
1399
+ # Optional embeddings
1400
+ kv = None; emb_dim = 0
1401
+ if bool(use_embeddings):
1402
+ kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
1403
+
1404
+ # Optional per-language partitioning to avoid cross-language mixing
1405
+ parts = []
1406
+ if bool(per_language) and "lang" in df_main.columns:
1407
+ for lang_code, grp in df_main.groupby("lang"):
1408
+ if lang_code in (None, "", "unknown"):
1409
+ # keep unknown together
1410
+ parts.append(("unknown", grp.reset_index(drop=True)))
1411
+ else:
1412
+ parts.append((lang_code, grp.reset_index(drop=True)))
1413
  else:
1414
+ parts = [("all", df_main.reset_index(drop=True))]
1415
+
1416
+ # Collect per-part results to concatenate
1417
+ labels_list = []
1418
+ cluster_name_list = []
1419
+ anomaly_list = []
1420
+ X_reduced_holder = None # only kept if use_lsa True and single-part; else None
1421
+ nn_index_obj = None # per overall (we build on all df_main at end)
1422
+ term_names_global = {}
1423
+
1424
+ # We'll build a global ANN/search index only if single partition & LSA on
1425
+ single_partition = (len(parts) == 1)
1426
+ d_word_agg = 0; d_char_agg=0;
1427
+ k_agg = 0
1428
+
1429
+ for p_lang, df_part in parts:
1430
+ if df_part.empty: continue
1431
+ texts, subjects_only = _make_texts(df_part)
1432
+
1433
+ # Vectorize
1434
+ X_full, count_vec, char_vec, bm25_local, d_word, d_char, d_full = _vectorize_block(
1435
+ texts=texts,
1436
+ use_bigrams=bool(use_bigrams),
1437
+ max_features=int(max_features or 120000),
1438
+ min_df=int(min_df or 3),
1439
+ max_df=float(max_df or 0.7),
1440
+ use_hashing=bool(use_hashing),
1441
+ hash_bits=int(hash_bits or 18)
1442
  )
1443
+ d_word_agg += d_word; d_char_agg += d_char
1444
+
1445
+ # Dim reduction
1446
+ X_reduced, svd_obj_local, norm_obj_local = _reduce_space(X_full, bool(use_lsa), int(lsa_dim or 256))
1447
+ X_space = (X_reduced if X_reduced is not None else X_full)
1448
+
1449
+ # Optional embeddings concat
1450
+ if kv is not None and emb_dim > 0 and float(embed_weight or 0) > 0:
1451
+ X_space, _ = _attach_embeddings(texts, X_space, bool(use_lsa), kv, emb_dim, float(embed_weight))
1452
+
1453
+ # Optional anomaly (on LSA only, before clustering)
1454
+ anomaly_scores = np.full((len(df_part),), np.nan, dtype=np.float32)
1455
+ if X_reduced is not None and bool(use_iso) and ISO_OK and X_reduced.shape[0] >= 50:
1456
+ try:
1457
+ iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
1458
+ iso.fit(X_reduced)
1459
+ anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
1460
+ except Exception:
1461
+ pass
1462
+
1463
+ # Cluster (HDBSCAN or MiniBatchKMeans)
1464
+ labels, centers, chosen_k = _cluster_space(
1465
+ X_space=X_space, df_part=df_part, use_lsa=bool(use_lsa),
1466
+ use_hdbscan=bool(use_hdbscan), hdb_min_cluster=int(hdb_min_cluster or 60),
1467
+ hdb_min_samples=int(hdb_min_samples or 0),
1468
+ auto_k=bool(auto_k), k_clusters=int(k_clusters or 350), mb_batch=int(mb_batch or 4096),
1469
+ count_vec=count_vec, svd_obj=svd_obj_local, norm_obj=norm_obj_local,
1470
+ d_word=d_word, d_char=d_char
1471
+ )
1472
+ k_agg += chosen_k
1473
 
1474
+ # HDBSCAN yields -1 noise; keep as-is. KMeans may have merges above.
1475
+ # Labels -> cluster names
1476
+ term_names = cluster_labels_pmi_bigram(
1477
+ texts=texts, labels=labels, subjects=subjects_only, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20
1478
+ )
1479
+ term_names_global.update({int(k): v for k, v in term_names.items()})
1480
 
1481
+ labels_list.append(pd.Series(labels, index=df_part.index))
1482
+ cluster_name_list.append(pd.Series([term_names.get(int(c), f"noise_{int(c)}" if c < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
1483
+ anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
 
 
1484
 
1485
+ # If single partition and LSA on, we’ll build ANN on that space after loop
1486
+ if single_partition and bool(use_lsa) and X_reduced is not None and X_reduced_holder is None:
1487
+ X_reduced_holder = X_reduced
 
 
1488
 
1489
+ # Stitch part results back to df_main order
1490
+ if not labels_list: # handle case where df_main was empty
1491
+ df_main["cluster_id"] = -10
1492
+ df_main["cluster_name"] = "unclustered"
1493
+ df_main["anomaly_score"] = np.nan
1494
+ else:
1495
+ df_main = df_main.copy()
1496
+ df_main["cluster_id"] = pd.concat(labels_list).sort_index()
1497
+ df_main["cluster_name"] = pd.concat(cluster_name_list).sort_index()
1498
+ df_main["anomaly_score"] = pd.concat(anomaly_list).sort_index()
1499
+
1500
+ # Keep your special buckets
1501
+ if len(df_news):
1502
+ df_news["cluster_id"] = -1; df_news["cluster_name"] = "newsletter/news"; df_news["anomaly_score"] = np.nan
1503
  if len(df_alerts):
1504
+ df_alerts["cluster_id"] = -2; df_alerts["cluster_name"] = "system/alerts"; df_alerts["anomaly_score"] = np.nan
 
 
1505
 
1506
  # Combine back
1507
  df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
1508
 
1509
+ # Corruption score (unchanged)
1510
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1511
 
1512
+ # --- Build search index (Faiss on LSA if single partition; else fallback to cosine brute) ---
1513
+ use_faiss = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
1514
  index_obj = None
1515
  if use_faiss:
1516
+ d = X_reduced_holder.shape[1]
1517
+ index_obj = faiss.IndexFlatIP(d)
1518
+ index_obj.add(X_reduced_holder)
1519
  else:
1520
+ # fallback: brute cosine on feature space of df_main again (we need a space to fit)
1521
+ # For multi-part, just use corruption/search blending without ANN speedup (NearestNeighbors over dense LSA best-effort)
1522
+ try:
1523
+ if bool(use_lsa) and X_reduced_holder is not None and single_partition:
1524
+ nn = NearestNeighbors(metric="cosine", algorithm="brute")
1525
+ nn.fit(X_reduced_holder)
1526
+ index_obj = nn
1527
+ else:
1528
+ # If multi-part or non-LSA, do a minimal brute heuristic by re-vectorizing only for search when needed.
1529
+ # We'll defer exact vectors to search_fn’s projection.
1530
+ nn = NearestNeighbors(metric="cosine", algorithm="brute")
1531
+ # Fit on a tiny placeholder so object exists; we’ll guard in search.
1532
+ nn.fit(np.zeros((1, 4), dtype=np.float32))
1533
+ index_obj = nn
1534
+ except Exception:
1535
+ index_obj = None
1536
+
1537
  # Summaries
1538
  cluster_counts = (
1539
+ df[df["cluster_id"] >= 0]
1540
  .groupby(["cluster_id", "cluster_name"]).size()
1541
  .reset_index(name="count")
1542
  .sort_values("count", ascending=False)
 
1546
  # Append buckets
1547
  news_count = int((df["cluster_id"] == -1).sum())
1548
  alerts_count = int((df["cluster_id"] == -2).sum())
1549
+ hdbscan_noise = int((df["cluster_id"] < -2).sum())
1550
  extra_rows = []
1551
  if news_count > 0:
1552
  extra_rows.append({"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count})
1553
  if alerts_count > 0:
1554
  extra_rows.append({"cluster_id": -2, "cluster_name": "system/alerts", "count": alerts_count})
1555
+ if hdbscan_noise > 0:
1556
+ extra_rows.append({"cluster_id": -3, "cluster_name": "HDBSCAN noise", "count": hdbscan_noise})
1557
  if extra_rows:
1558
  cluster_counts = pd.concat([cluster_counts, pd.DataFrame(extra_rows)], ignore_index=True)
1559
 
 
1578
  )
1579
  sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
1580
 
1581
+
1582
  # Languages present
1583
  langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
1584
  lang_choices = ["(any)"] + langs
 
1600
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
1601
  out_table = show_df[cols_out].head(500)
1602
 
1603
+ # The vectorizer state is complex now with partitioning. For search, we need to rebuild it on the fly.
1604
+ # So we store the params to do so.
1605
+ vec_state = {
1606
+ "use_hashing": bool(use_hashing), "hash_bits": int(hash_bits),
1607
+ "max_features": int(max_features), "min_df": int(min_df), "max_df": float(max_df),
1608
+ "use_bigrams": bool(use_bigrams),
1609
+ # dummy objects for search fn compatibility
1610
+ "count_vec": None, "char_vec": None, "bm25": None
1611
+ }
1612
 
1613
  status_md = (
1614
  f"**Processed {len(df):,} emails** \n"
1615
+ f"Word feats: {d_word_agg:,} | Char feats: {d_char_agg:,} (x0.20) \n"
1616
+ f"{'LSA: ' + str(X_reduced_holder.shape[1]) + ' dims | ' if X_reduced_holder is not None else ''}"
1617
+ f"k = {k_agg} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1618
  f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
1619
  )
1620
 
 
1624
  domain_update = gr.update(choices=domain_choices, value="(any)")
1625
  sender_update = gr.update(choices=sender_choices, value="(any)")
1626
  lang_update = gr.update(choices=lang_choices, value="(any)")
1627
+
1628
+ # NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
1629
+ svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
1630
+ norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
1631
 
1632
  return (
1633
  status_md,
1634
+ cluster_counts, domain_counts, sender_counts,
1635
  actors, offhours_table,
1636
  out_table,
1637
+ df, vec_state, (X_reduced_holder if bool(use_lsa) else None), index_obj, term_names_global,
1638
+ bool(use_lsa), bool(use_faiss),
1639
  cluster_update, domain_update, sender_update, lang_update,
1640
+ svd_obj_out, norm_obj_out,
1641
+ (d_word_agg, d_char_agg),
1642
  extra_terms_lower, bool(highlight_toggle)
1643
  )
1644
 
1645
  (run_btn.click)(
1646
  process_file,
1647
+ inputs=[
1648
+ inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1649
+ use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1650
+ trusted_domains_in, extra_keywords_in, highlight_toggle,
1651
+ # NEW:
1652
+ use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1653
+ per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary
1654
+ ],
1655
  outputs=[status,
1656
+ cluster_counts_df, domain_counts_df, sender_counts_df,
1657
  actors_df, offhours_df,
1658
  results_df,
1659
  state_df, state_vec, state_X_reduced, state_index, state_term_names,
 
1684
  tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
1685
  tmp = tmp.drop(columns=["_dt"])
1686
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
1687
+ if "search_score" in tmp.columns:
1688
+ cols_out.append("search_score")
1689
  acc = [c for c in cols_out if c in tmp.columns]
1690
  return tmp[acc].head(500)
1691
 
 
1731
  except Exception:
1732
  return None
1733
 
1734
+ def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
1735
+ # This has to re-fit a vectorizer to project the query, since partitioning means
1736
+ # we don't have a single global one.
1737
+ if vec_state.get("use_hashing"):
1738
+ hv = HashingVectorizer(
1739
+ analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
1740
+ n_features=2**vec_state.get('hash_bits', 18), alternate_sign=False,
1741
+ token_pattern=TOKEN_PATTERN, lowercase=True, norm=None
1742
+ )
1743
+ # fit to get the transformer state on the corpus
1744
+ word_counts = hv.fit_transform(corpus_texts_for_fit)
1745
+ tfidf_tr = TfidfTransformer().fit(word_counts)
1746
+ q_word_counts = hv.transform([q])
1747
+ q_word = tfidf_tr.transform(q_word_counts)
1748
+ else: # BM25 path
1749
+ count_vec = CountVectorizer(
1750
+ analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
1751
+ max_features=vec_state.get('max_features'), min_df=vec_state.get('min_df'),
1752
+ max_df=vec_state.get('max_df'), token_pattern=TOKEN_PATTERN, lowercase=True,
1753
+ dtype=np.float32, stop_words=STOPWORD_FOR_VEC
1754
+ )
1755
+ TF = count_vec.fit_transform(corpus_texts_for_fit)
1756
+ bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
1757
+ q_word_tf = count_vec.transform([q])
1758
+ q_word = bm25.transform(q_word_tf)
1759
+
1760
+ char_vec = CharTfidf(
1761
+ analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
1762
+ lowercase=True, dtype=np.float32
1763
+ ).fit(corpus_texts_for_fit)
1764
+
1765
+ q_char = char_vec.transform([q])
1766
+ q_full = hstack([q_word, q_char * 0.20], format="csr")
1767
  return q_full
1768
 
1769
  def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
1770
  if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
1771
  return pd.DataFrame(), []
1772
+
1773
+ # align with df_main order (exclude -1 and -2)
1774
+ mask = ~df["cluster_id"].isin([-1, -2, -3])
1775
+ filtered_df = df[mask].reset_index(drop=True)
1776
+ if filtered_df.empty:
1777
+ return pd.DataFrame(), []
1778
+
1779
  q_terms = _tokenize_query(q)
1780
+ q_vec_full = _vectorize_query(q, vec_state, corpus_texts_for_fit=list(filtered_df.apply(enrich_text, axis=1)))
1781
+
1782
+ if use_lsa_flag and (X_reduced is not None) and (svd_obj is not None) and (norm_obj is not None):
1783
  q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
1784
  if q_emb is None:
1785
  return pd.DataFrame(), q_terms
1786
  else:
1787
  q_emb = q_vec_full
1788
+
1789
+ n_neighbors = min(50, len(filtered_df))
1790
+ if n_neighbors <= 0: return pd.DataFrame(), q_terms
1791
+
 
1792
  if isinstance(index_obj, NearestNeighbors):
1793
+ # Check if index was fit on placeholder
1794
+ if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
1795
+ return pd.DataFrame(), q_terms # cannot search
1796
+ distances, indices = index_obj.kneighbors(q_emb, n_neighbors=n_neighbors)
1797
  inds = indices[0]
1798
  sims = 1.0 - distances[0]
1799
  results = filtered_df.iloc[inds].copy()
1800
  results["search_score"] = sims
1801
  elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
1802
+ D, I = index_obj.search(q_emb.astype(np.float32), k=n_neighbors)
1803
  inds = I[0]
1804
  sims = D[0]
1805
  results = filtered_df.iloc[inds].copy()
 
1809
 
1810
  # blend with corruption score lightly
1811
  cs = results["corruption_score"].fillna(0.0)
1812
+ cs_norm = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9) if (cs.max() > cs.min()) else cs
1813
+ results["_blend"] = 0.7*results["search_score"].values + 0.3*cs_norm.values
1814
  # sort UI-selected way
1815
  if sort_by == "search_score":
1816
  results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
 
1821
  results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
1822
  results = results.drop(columns=["_blend"])
1823
  cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
1824
+ return _sort_results(results.head(50), sort_by, sort_dir), q_terms
1825
 
1826
  search_btn.click(
1827
  search_fn,
 
1849
  if dstr is not None:
1850
  cand = cand[cand["date"] == dstr]
1851
  if len(cand) == 0:
1852
+ # Fallback to subject only if exact match fails
1853
  cand = df[df["subject"] == sel.get("subject", "")]
1854
  if len(cand) == 0:
1855
+ return f"Could not find original record for: {subj}"
1856
  row = cand.iloc[0]
1857
+ cid = int(row.get("cluster_id", -99))
1858
+ clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
1859
  return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
1860
 
1861
  results_df.select(
 
1874
  return "(any)"
1875
  label = df_sum.iloc[row_idx]["label"]
1876
  return label if isinstance(label, str) else "(any)"
1877
+
1878
+ # Click domain summary to filter
1879
+ def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
1880
+ try:
1881
+ row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
1882
+ except Exception:
1883
+ row_idx = evt.index if hasattr(evt, "index") else None
1884
+ if row_idx is None or df_sum is None or len(df_sum)==0:
1885
+ return "(any)"
1886
+ val = df_sum.iloc[row_idx]["from_domain"]
1887
+ return val if isinstance(val, str) and val else "(any)"
1888
 
1889
  cluster_counts_df.select(
1890
  on_cluster_click,
 
1895
  inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
1896
  outputs=[results_df]
1897
  )
1898
+
1899
+ domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
1900
+ .then(
1901
+ refresh_results,
1902
+ inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
1903
+ outputs=[results_df]
1904
+ )
1905
+
1906
+ def on_sender_click(evt: gr.SelectData, df_sum: pd.DataFrame):
1907
+ try:
1908
+ row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
1909
+ except Exception:
1910
+ row_idx = evt.index if hasattr(evt, "index") else None
1911
+ if row_idx is None or df_sum is None or len(df_sum)==0:
1912
+ return "(any)"
1913
+ val = df_sum.iloc[row_idx]["from_email"]
1914
+ return val if isinstance(val, str) and val else "(any)"
1915
+
1916
+ sender_counts_df.select(on_sender_click, inputs=[sender_counts_df], outputs=[sender_drop]) \
1917
+ .then(
1918
+ refresh_results,
1919
+ inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
1920
+ outputs=[results_df]
1921
+ )
1922
+
1923
 
1924
  if __name__ == "__main__":
1925
+ demo.launch()