wuhp commited on
Commit
abfac2f
·
verified ·
1 Parent(s): 3e01db4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -58
app.py CHANGED
@@ -227,17 +227,28 @@ YEAR_RE = re.compile(r"^(19|20)\d{2}$")
227
  NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
228
  ONE_CHAR_RE = re.compile(r"^.$")
229
 
 
 
 
 
 
 
230
  # This stoplist is used by the CountVectorizer (MUST be list for sklearn)
231
  STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
232
 
233
  def _is_junk_term(t: str) -> bool:
234
- tl = t.lower()
235
- if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
236
- return True
237
  if EMAIL_LIKE_RE.search(tl): return True
238
  if YEAR_RE.match(tl): return True
239
  if NUMERIC_RE.match(tl): return True
240
  if ONE_CHAR_RE.match(tl): return True
 
 
 
 
 
241
  return False
242
 
243
  def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
@@ -686,13 +697,22 @@ def enrich_text(row: pd.Series) -> str:
686
  tokens.append(lang_tok)
687
  return (t + " " + " ".join(tokens)).strip()
688
 
689
- # =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
690
- def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20):
 
 
 
 
 
 
 
 
691
  """
692
  Improved labeler:
693
  - Considers bigrams AND trigrams (PMI vs. global)
694
  - Class-TFIDF unigrams with subject coverage boost
695
  - Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
 
696
  """
697
  import math as _math
698
  from collections import Counter, defaultdict
@@ -704,12 +724,13 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
704
  def is_junk_token(tok: str) -> bool:
705
  if _is_junk_term(tok): return True
706
  tl = tok.lower()
707
- if tl.startswith("__"): return True
708
- if tl in STOP_TERMS: return True
709
- if tl in HEADER_STOP: return True
710
  if "@" in tl: return True
711
  if tl.isascii() and len(tl) <= 2: return True
712
- if re.search(r"[^\w\-']", tl) and "’" not in tl and "'" not in tl: return True
 
 
 
713
  return False
714
 
715
  def tokenize_clean(t):
@@ -769,7 +790,7 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
769
  for c in sorted(set(int(x) for x in labels)):
770
  n_docs_c = max(1, per_c_doc_count[c])
771
 
772
- # PMI bigrams & trigrams with subject-coverage boost
773
  phrases = []
774
  for store, glob_df, subj_docs, n in (
775
  (per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
@@ -788,14 +809,17 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
788
  cov = 0.0
789
  if have_subjects:
790
  cov = subj_docs[ng] / n_docs_c
 
 
 
791
  score += subject_alpha * cov
792
- scored.append((score, ng))
793
- scored.sort(reverse=True)
794
- # take a couple from each class to avoid only-bigrams/only-trigrams
795
  take = max(1, topn // (3 if n == 3 else 2))
796
- phrases.extend([p for _, p in scored[:take]])
797
 
798
- # Class-TFIDF unigrams with subject coverage boost
799
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
800
  docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
801
  corpus = [docs_c[0], docs_bg[0]]
@@ -807,16 +831,24 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
807
  vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
808
  row = X[0].toarray().ravel()
809
 
810
- # Subject coverage vector
811
  subj_cov = np.zeros_like(row)
 
 
812
  if have_subjects:
813
- vocab_index = {t:i for i,t in enumerate(vocab)}
814
  for tok, cnt_docs in per_c_subj_uni_docs[c].items():
815
  if tok in vocab_index and not is_junk_token(tok):
816
- subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
 
 
 
817
 
 
818
  row_boosted = row + subject_alpha * subj_cov
819
- order = row_boosted.argsort()[::-1]
 
 
 
 
820
  unis = []
821
  for i in order:
822
  tok = vocab[i]
@@ -924,13 +956,95 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
924
  return seeds_red
925
  return None
926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
927
  # =================== Scoring & Flags ===================
928
  def _hour_of(dt_iso: str) -> Optional[int]:
929
  try:
930
  if not dt_iso: return None
931
  dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
932
  if pd.isna(dt): return None
933
- # treat UTC for lack of per-user tz; still useful as "odd hour"
934
  return int(dt.hour)
935
  except Exception:
936
  return None
@@ -960,11 +1074,9 @@ def corruption_score(row, trusted_domains: set):
960
  body_len = len(row.get("body_text",""))
961
  if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
962
  score += 0.5
963
- # personal/off-channel via headers
964
  fd = (row.get("from_domain") or "").lower()
965
  if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
966
  score += 0.5
967
- # odd hours
968
  h = _hour_of(row.get("date") or "")
969
  if h is not None and (h < 6 or h > 22):
970
  score += 0.3
@@ -1044,6 +1156,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1044
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
1045
  sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
1046
  sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
 
 
1047
 
1048
  with gr.Row():
1049
  run_btn = gr.Button("Process", variant="primary")
@@ -1121,6 +1235,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1121
  tag_value: str,
1122
  start: str,
1123
  end: str,
 
1124
  ) -> pd.DataFrame:
1125
  out = df
1126
  if cluster and cluster != "(any)":
@@ -1151,11 +1266,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1151
  out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
1152
  except Exception:
1153
  pass
 
 
1154
  return out
1155
 
1156
  # -------- Simple social network stats --------
1157
  def social_stats(df: pd.DataFrame) -> pd.DataFrame:
1158
- # degree = unique counterparts per address (from <-> each to/cc)
1159
  deg = {}
1160
  def add_edge(a,b):
1161
  if not a or not b or a==b: return
@@ -1186,7 +1302,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1186
 
1187
  # === Vectorization & Clustering (UPGRADED) ===
1188
  def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
1189
- # “texts” feed vectorizers; “subjects_only” helps labels
1190
  texts = list(df_in.apply(enrich_text, axis=1))
1191
  subjects_only = list(df_in["subject"].fillna(""))
1192
  return texts, subjects_only
@@ -1228,7 +1343,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1228
  count_vec = None; bm25 = None
1229
  return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
1230
 
1231
- # Original Count -> BM25 + char
1232
  count_vec = CountVectorizer(
1233
  analyzer="word", ngram_range=ngram_range,
1234
  max_features=int(max_features) if max_features else None,
@@ -1258,12 +1372,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1258
 
1259
  n_docs = X_full.shape[0]
1260
  n_feats = X_full.shape[1]
1261
- # valid bound for TruncatedSVD: 1 <= n_components < min(n_docs, n_feats)
1262
  max_components = max(1, min(n_docs, n_feats) - 1)
1263
  n_comp = int(min(int(lsa_dim or 256), max_components))
1264
 
1265
  if n_comp < 2:
1266
- # too small to reduce meaningfully — skip LSA
1267
  return None, None, None
1268
 
1269
  svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
@@ -1274,9 +1386,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1274
  return X_reduced, svd_obj, norm_obj
1275
 
1276
  def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
1277
- """
1278
- Concatenate averaged word vectors (dense) into the current space.
1279
- """
1280
  if kv is None or emb_dim <= 0 or weight <= 0.0:
1281
  return X_reduced_or_full, emb_dim
1282
  doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
@@ -1303,19 +1412,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1303
  norm_obj,
1304
  d_word, d_char
1305
  ):
1306
- """
1307
- Run HDBSCAN (if requested and available) or MiniBatchKMeans.
1308
- """
1309
  n = X_space.shape[0]
1310
 
1311
- # NEW: trivial/tiny partition handling
1312
  if n <= 1:
1313
  labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
1314
  centers = None
1315
  chosen_k = int(n) if n > 0 else 0
1316
  return labels, centers, chosen_k
1317
  if n < 10:
1318
- # avoid unstable large-k on tiny sets
1319
  k_small = min(max(2, n // 2), n)
1320
  kmeans = MiniBatchKMeans(
1321
  n_clusters=int(k_small),
@@ -1341,7 +1445,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1341
  chosen_k = int(len(set([l for l in labels if l >= 0])))
1342
  return labels, centers, chosen_k
1343
 
1344
- # Otherwise MiniBatchKMeans
1345
  if bool(auto_k):
1346
  if use_lsa and isinstance(X_space, np.ndarray):
1347
  k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
@@ -1373,7 +1476,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1373
  chosen_k = int(len(set(labels)))
1374
  return labels, centers, chosen_k
1375
 
1376
- # Main logic starts
1377
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
1378
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1379
  extra_terms_lower = [t.lower() for t in extra_terms]
@@ -1424,7 +1526,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1424
  if bool(use_embeddings):
1425
  kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
1426
 
1427
- # >>> FIX: keep original indices when splitting (no reset_index here)
1428
  parts = []
1429
  if bool(per_language) and "lang" in df_main.columns:
1430
  for lang_code, grp in df_main.groupby("lang", dropna=False):
@@ -1488,18 +1589,26 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1488
  d_word=d_word,
1489
  d_char=d_char,
1490
  )
1491
- k_agg += chosen_k
 
 
 
 
 
 
 
 
1492
 
1493
  term_names = cluster_labels_pmi_bigram(
1494
- texts=texts, labels=labels, subjects=subjects_only, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20
 
1495
  )
1496
  term_names_global.update({int(k): v for k, v in term_names.items()})
1497
 
1498
- # >>> We kept original indices in df_part, so Series align to df_main on concat
1499
  labels_list.append(pd.Series(labels, index=df_part.index))
1500
  cluster_name_list.append(
1501
  pd.Series(
1502
- [term_names.get(int(c), f"noise_{int(c)}" if c < 0 else f"cluster_{int(c)}") for c in labels],
1503
  index=df_part.index,
1504
  )
1505
  )
@@ -1517,7 +1626,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1517
  df_main["cluster_name"] = "unclustered"
1518
  df_main["anomaly_score"] = np.nan
1519
 
1520
- # >>> Use .loc to avoid chained-assignment warnings
1521
  if len(df_news):
1522
  df_news.loc[:, "cluster_id"] = -1
1523
  df_news.loc[:, "cluster_name"] = "newsletter/news"
@@ -1546,7 +1654,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1546
  except Exception:
1547
  pass
1548
 
1549
- # Summaries and UI updates...
1550
  cluster_counts = (
1551
  df.groupby(["cluster_id", "cluster_name"])
1552
  .size()
@@ -1585,7 +1692,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1585
  "max_df": float(max_df),
1586
  "use_bigrams": bool(use_bigrams),
1587
  }
1588
- status_md = f"**Processed {len(df):,} emails** | k = {k_agg}"
1589
 
1590
  svd_obj_out = svd_obj_local if single_partition else None
1591
  norm_obj_out = norm_obj_local if single_partition else None
@@ -1710,13 +1817,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1710
 
1711
  return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
1712
 
1713
- def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir):
1714
  if df is None or len(df) == 0:
1715
  return pd.DataFrame()
1716
- filt = _apply_filters(df, cluster, domain, sender, lang, sentiment, tag, start, end)
 
 
1717
  return _sort_results(filt, sort_by, sort_dir)
1718
 
1719
- for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]:
 
 
1720
  ctrl.change(
1721
  refresh_results,
1722
  inputs=[
@@ -1731,14 +1842,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1731
  date_end,
1732
  sort_by,
1733
  sort_dir,
 
1734
  ],
1735
  outputs=[results_df],
1736
  )
1737
 
 
1738
  reset_btn.click(
1739
- lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"],
1740
  [],
1741
- [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
 
1742
  ).then(
1743
  refresh_results,
1744
  inputs=[
@@ -1753,21 +1867,25 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1753
  date_end,
1754
  sort_by,
1755
  sort_dir,
 
1756
  ],
1757
  outputs=[results_df],
1758
  )
1759
 
 
1760
  def _tokenize_query(q: str) -> List[str]:
1761
- return [p.strip() for p in re.split(r"\s+", q) if p.strip()][:8]
1762
 
1763
  def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
1764
  try:
1765
  return norm.transform(svd.transform(q_vec)).astype(np.float32)
1766
- except:
1767
  return None
1768
 
1769
  def _vectorize_query(q, vec_state, corpus_texts):
 
1770
  char_min_df = 1 if len(corpus_texts) <= 1 else 2
 
1771
  if vec_state.get("use_hashing"):
1772
  hv = HashingVectorizer(
1773
  analyzer="word",
@@ -1775,7 +1893,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1775
  n_features=2 ** vec_state.get("hash_bits", 18),
1776
  token_pattern=TOKEN_PATTERN,
1777
  lowercase=True,
 
 
1778
  )
 
1779
  counts = hv.transform(corpus_texts)
1780
  tfidf_tr = TfidfTransformer().fit(counts)
1781
  q_word = tfidf_tr.transform(hv.transform([q]))
@@ -1789,21 +1910,24 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1789
  token_pattern=TOKEN_PATTERN,
1790
  lowercase=True,
1791
  stop_words=STOPWORD_FOR_VEC,
 
1792
  )
1793
  tf = cv.fit_transform(corpus_texts)
1794
  bm25 = BM25Transformer().fit(tf)
1795
  q_word = bm25.transform(cv.transform([q]))
1796
 
1797
  char_vec = CharTfidf(
1798
- analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True
1799
  ).fit(corpus_texts)
1800
  q_char = char_vec.transform([q])
 
1801
  return hstack([q_word, q_char * 0.20], format="csr")
1802
 
1803
  def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
1804
  if not q or df is None or vec is None or index is None:
1805
  return pd.DataFrame(), []
1806
 
 
1807
  mask = ~df["cluster_id"].isin([-1, -2, -3])
1808
  df_main = df[mask].reset_index(drop=True)
1809
  if df_main.empty:
@@ -1812,7 +1936,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1812
  q_terms = _tokenize_query(q)
1813
  q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))
1814
 
1815
- q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd and norm else q_vec
1816
  if q_emb is None:
1817
  return pd.DataFrame(), q_terms
1818
 
@@ -1821,13 +1945,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1821
  return pd.DataFrame(), q_terms
1822
 
1823
  if isinstance(index, NearestNeighbors):
 
1824
  if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
1825
  return pd.DataFrame(), q_terms
1826
  dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
1827
  sims = 1.0 - dists[0]
1828
  results = df_main.iloc[inds[0]].copy()
1829
  results["search_score"] = sims
1830
- elif use_faiss and FAISS_OK and isinstance(index, faiss.Index):
1831
  D, I = index.search(q_emb.astype(np.float32), k=n_req)
1832
  results = df_main.iloc[I[0]].copy()
1833
  results["search_score"] = D[0]
@@ -1854,12 +1979,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1854
  outputs=[results_df, state_query_terms],
1855
  )
1856
 
 
1857
  def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
1858
  if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
1859
  return ""
1860
  row_idx = evt.index[0]
1861
  sel = table.iloc[row_idx]
1862
 
 
1863
  cand = df[
1864
  (df["subject"] == sel.get("subject"))
1865
  & (df["from_email"] == sel.get("from_email"))
@@ -1874,7 +2001,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1874
  cid = int(row.get("cluster_id", -99))
1875
  clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
1876
  return build_highlighted_html(
1877
- row, query_terms=q_terms, cluster_label=clabel, do_highlight=do_highlight, extra_terms=extra_terms
 
 
 
 
1878
  )
1879
 
1880
  results_df.select(
@@ -1883,12 +2014,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1883
  outputs=[email_view],
1884
  )
1885
 
 
1886
  def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
1887
  if evt.index is None or df_sum is None or df_sum.empty:
1888
  return gr.update()
1889
  val = df_sum.iloc[evt.index[0]][col_name]
1890
  return gr.update(value=val)
1891
 
 
1892
  cluster_counts_df.select(
1893
  lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
1894
  ).then(
@@ -1905,9 +2038,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1905
  date_end,
1906
  sort_by,
1907
  sort_dir,
 
1908
  ],
1909
  outputs=[results_df],
1910
  )
 
 
1911
  domain_counts_df.select(
1912
  lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
1913
  ).then(
@@ -1924,9 +2060,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1924
  date_end,
1925
  sort_by,
1926
  sort_dir,
 
1927
  ],
1928
  outputs=[results_df],
1929
  )
 
 
1930
  sender_counts_df.select(
1931
  lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
1932
  ).then(
@@ -1943,11 +2082,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1943
  date_end,
1944
  sort_by,
1945
  sort_dir,
 
1946
  ],
1947
  outputs=[results_df],
1948
  )
1949
 
1950
-
1951
  if __name__ == "__main__":
1952
  # Disable SSR to avoid handler arity warnings under server-side rendering
1953
- demo.launch(ssr_mode=False)
 
227
  NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
228
  ONE_CHAR_RE = re.compile(r"^.$")
229
 
230
+ # ---- NEW junk token guards (base64/hex/trackers/very-long) ----
231
+ LONG_ALNUM_RE = re.compile(r"^[A-Za-z0-9_-]{24,}$") # long tracking/b64-ish
232
+ HEXISH_RE = re.compile(r"^(?:[A-Fa-f0-9]{8,})$") # long hex blobs
233
+ DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$") # too many digits
234
+ UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
235
+
236
  # This stoplist is used by the CountVectorizer (MUST be list for sklearn)
237
  STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
238
 
239
  def _is_junk_term(t: str) -> bool:
240
+ tl = (t or "").strip().lower()
241
+ if not tl: return True
242
+ if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS: return True
243
  if EMAIL_LIKE_RE.search(tl): return True
244
  if YEAR_RE.match(tl): return True
245
  if NUMERIC_RE.match(tl): return True
246
  if ONE_CHAR_RE.match(tl): return True
247
+ if LONG_ALNUM_RE.match(t): return True
248
+ if HEXISH_RE.match(t): return True
249
+ if DIGIT_HEAVY_RE.match(t): return True
250
+ if UNDERSCORE_HEAVY_RE.match(t): return True
251
+ if len(t) > 40: return True
252
  return False
253
 
254
  def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
 
697
  tokens.append(lang_tok)
698
  return (t + " " + " ".join(tokens)).strip()
699
 
700
+ # =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST (+ coverage ≥30% preference) ===================
701
+ def cluster_labels_pmi_bigram(
702
+ texts,
703
+ labels,
704
+ subjects=None,
705
+ topn=6,
706
+ subject_alpha=0.75,
707
+ global_ubiq_cut=0.20,
708
+ subject_min_cov=0.30 # NEW: prefer subject terms that appear in ≥30% of a cluster's subjects
709
+ ):
710
  """
711
  Improved labeler:
712
  - Considers bigrams AND trigrams (PMI vs. global)
713
  - Class-TFIDF unigrams with subject coverage boost
714
  - Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
715
+ - NEW: prefers subject terms that occur in ≥30% of cluster subjects (can be tuned via subject_min_cov)
716
  """
717
  import math as _math
718
  from collections import Counter, defaultdict
 
724
  def is_junk_token(tok: str) -> bool:
725
  if _is_junk_term(tok): return True
726
  tl = tok.lower()
727
+ if tl.startswith("__"): return True # our feature flags
 
 
728
  if "@" in tl: return True
729
  if tl.isascii() and len(tl) <= 2: return True
730
+ if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
731
+ if len(tok) > 40: return True
732
+ # strip punctuation-heavy artifacts (URLs already replaced with 'URL')
733
+ if re.search(r"[^\w\-’']", tl): return True
734
  return False
735
 
736
  def tokenize_clean(t):
 
790
  for c in sorted(set(int(x) for x in labels)):
791
  n_docs_c = max(1, per_c_doc_count[c])
792
 
793
+ # ===== PMI bigrams & trigrams with subject-coverage boost & ≥30% preference =====
794
  phrases = []
795
  for store, glob_df, subj_docs, n in (
796
  (per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
 
809
  cov = 0.0
810
  if have_subjects:
811
  cov = subj_docs[ng] / n_docs_c
812
+ # add a preference bump if subject coverage ≥ threshold
813
+ if cov >= subject_min_cov:
814
+ score += 0.6 # modest, ensures such terms bubble up
815
  score += subject_alpha * cov
816
+ scored.append((score, cov, ng))
817
+ # prefer subject-coverage ≥ threshold first, then highest score
818
+ scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
819
  take = max(1, topn // (3 if n == 3 else 2))
820
+ phrases.extend([p for _, _, p in scored[:take]])
821
 
822
+ # ===== Class-TFIDF unigrams with subject coverage boost & ≥30% preference =====
823
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
824
  docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
825
  corpus = [docs_c[0], docs_bg[0]]
 
831
  vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
832
  row = X[0].toarray().ravel()
833
 
 
834
  subj_cov = np.zeros_like(row)
835
+ subj_cov_frac = np.zeros_like(row)
836
+ vocab_index = {t:i for i,t in enumerate(vocab)}
837
  if have_subjects:
 
838
  for tok, cnt_docs in per_c_subj_uni_docs[c].items():
839
  if tok in vocab_index and not is_junk_token(tok):
840
+ i = vocab_index[tok]
841
+ frac = cnt_docs / n_docs_c
842
+ subj_cov[i] = frac
843
+ subj_cov_frac[i] = frac
844
 
845
+ # base + subject alpha
846
  row_boosted = row + subject_alpha * subj_cov
847
+ # final score gets a preference bump if subj coverage ≥ threshold (but not a hard filter)
848
+ pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
849
+ final = row_boosted + pref_bump
850
+
851
+ order = final.argsort()[::-1]
852
  unis = []
853
  for i in order:
854
  tok = vocab[i]
 
956
  return seeds_red
957
  return None
958
 
959
+ # =================== NEW: cluster stabilizer (merge near-dupes + reassign tiny → big; else NOISE=-3) ===================
960
+ def _centroids_from_labels(X, labels):
961
+ labs = np.asarray(labels, dtype=int)
962
+ uniq = np.unique(labs)
963
+ cents = {}
964
+ if isinstance(X, np.ndarray):
965
+ for c in uniq:
966
+ idx = (labs == c)
967
+ if not np.any(idx): continue
968
+ v = X[idx].mean(axis=0)
969
+ n = np.linalg.norm(v)
970
+ if n > 0: v = v / n
971
+ cents[int(c)] = v.astype(np.float32)
972
+ return cents
973
+ # CSR sparse
974
+ X = X.tocsr()
975
+ for c in uniq:
976
+ rows = np.where(labs == c)[0]
977
+ if rows.size == 0: continue
978
+ sub = X[rows]
979
+ v = np.asarray(sub.mean(axis=0)).ravel()
980
+ n = np.linalg.norm(v)
981
+ if n > 0: v = v / n
982
+ cents[int(c)] = v.astype(np.float32)
983
+ return cents
984
+
985
+ def _cosine_sim_to_centroids(vecs, centroids):
986
+ if not centroids:
987
+ return None, None
988
+ keys = list(centroids.keys())
989
+ C = np.stack([centroids[k] for k in keys], axis=0) # (k,d)
990
+ if isinstance(vecs, np.ndarray):
991
+ sims = vecs @ C.T
992
+ else:
993
+ sims = vecs.dot(C.T)
994
+ if hasattr(sims, "toarray"): sims = sims.toarray()
995
+ best_idx = np.argmax(sims, axis=1)
996
+ best_lab = np.array([keys[i] for i in best_idx], dtype=int)
997
+ best_sim = sims[np.arange(sims.shape[0]), best_idx]
998
+ return best_lab, best_sim
999
+
1000
+ def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
1001
+ labs = np.asarray(labels, dtype=int)
1002
+
1003
+ # 1) merge very close centroids
1004
+ cents = _centroids_from_labels(X_space, labs)
1005
+ keys = sorted([k for k in cents.keys() if k >= 0])
1006
+ if len(keys) >= 2:
1007
+ C = np.stack([cents[k] for k in keys], axis=0)
1008
+ sims = C @ C.T
1009
+ parent = {k:k for k in keys}
1010
+ def find(a):
1011
+ while parent[a]!=a:
1012
+ a = parent[a]
1013
+ return a
1014
+ for i in range(len(keys)):
1015
+ for j in range(i+1, len(keys)):
1016
+ if sims[i,j] >= float(merge_thresh):
1017
+ ri, rj = find(keys[i]), find(keys[j])
1018
+ if ri != rj:
1019
+ parent[rj] = ri
1020
+ root = {k: find(k) for k in keys}
1021
+ merge_map = {k: root[k] for k in keys}
1022
+ labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
1023
+ cents = _centroids_from_labels(X_space, labs)
1024
+
1025
+ # 2) reassign tiny clusters to nearest big centroid (else noise -3)
1026
+ vc = pd.Series(labs).value_counts()
1027
+ big_labs = set(vc[vc >= int(min_size)].index.tolist())
1028
+ small_labs = set(vc[vc < int(min_size)].index.tolist())
1029
+ big_cents = {c: cents[c] for c in big_labs if c in cents and c >= 0}
1030
+
1031
+ NOISE_ID = -3
1032
+ if small_labs and big_cents:
1033
+ idx_small = np.where(pd.Series(labs).isin(small_labs))[0]
1034
+ if idx_small.size > 0:
1035
+ sub = X_space[idx_small] if not isinstance(X_space, np.ndarray) else X_space[idx_small]
1036
+ best_lab, best_sim = _cosine_sim_to_centroids(sub, big_cents)
1037
+ reassigned = np.where(best_sim >= float(reassign_thresh), best_lab, NOISE_ID)
1038
+ labs[idx_small] = reassigned
1039
+
1040
+ return labs
1041
+
1042
  # =================== Scoring & Flags ===================
1043
  def _hour_of(dt_iso: str) -> Optional[int]:
1044
  try:
1045
  if not dt_iso: return None
1046
  dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
1047
  if pd.isna(dt): return None
 
1048
  return int(dt.hour)
1049
  except Exception:
1050
  return None
 
1074
  body_len = len(row.get("body_text",""))
1075
  if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
1076
  score += 0.5
 
1077
  fd = (row.get("from_domain") or "").lower()
1078
  if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
1079
  score += 0.5
 
1080
  h = _hour_of(row.get("date") or "")
1081
  if h is not None and (h < 6 or h > 22):
1082
  score += 0.3
 
1156
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
1157
  sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
1158
  sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
1159
+ # NEW: hide noise toggle
1160
+ hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
1161
 
1162
  with gr.Row():
1163
  run_btn = gr.Button("Process", variant="primary")
 
1235
  tag_value: str,
1236
  start: str,
1237
  end: str,
1238
+ hide_noise_flag: bool = False, # NEW
1239
  ) -> pd.DataFrame:
1240
  out = df
1241
  if cluster and cluster != "(any)":
 
1266
  out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
1267
  except Exception:
1268
  pass
1269
+ if hide_noise_flag:
1270
+ out = out[out["cluster_id"] != -3]
1271
  return out
1272
 
1273
  # -------- Simple social network stats --------
1274
  def social_stats(df: pd.DataFrame) -> pd.DataFrame:
 
1275
  deg = {}
1276
  def add_edge(a,b):
1277
  if not a or not b or a==b: return
 
1302
 
1303
  # === Vectorization & Clustering (UPGRADED) ===
1304
  def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
 
1305
  texts = list(df_in.apply(enrich_text, axis=1))
1306
  subjects_only = list(df_in["subject"].fillna(""))
1307
  return texts, subjects_only
 
1343
  count_vec = None; bm25 = None
1344
  return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
1345
 
 
1346
  count_vec = CountVectorizer(
1347
  analyzer="word", ngram_range=ngram_range,
1348
  max_features=int(max_features) if max_features else None,
 
1372
 
1373
  n_docs = X_full.shape[0]
1374
  n_feats = X_full.shape[1]
 
1375
  max_components = max(1, min(n_docs, n_feats) - 1)
1376
  n_comp = int(min(int(lsa_dim or 256), max_components))
1377
 
1378
  if n_comp < 2:
 
1379
  return None, None, None
1380
 
1381
  svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
 
1386
  return X_reduced, svd_obj, norm_obj
1387
 
1388
  def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
 
 
 
1389
  if kv is None or emb_dim <= 0 or weight <= 0.0:
1390
  return X_reduced_or_full, emb_dim
1391
  doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
 
1412
  norm_obj,
1413
  d_word, d_char
1414
  ):
 
 
 
1415
  n = X_space.shape[0]
1416
 
 
1417
  if n <= 1:
1418
  labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
1419
  centers = None
1420
  chosen_k = int(n) if n > 0 else 0
1421
  return labels, centers, chosen_k
1422
  if n < 10:
 
1423
  k_small = min(max(2, n // 2), n)
1424
  kmeans = MiniBatchKMeans(
1425
  n_clusters=int(k_small),
 
1445
  chosen_k = int(len(set([l for l in labels if l >= 0])))
1446
  return labels, centers, chosen_k
1447
 
 
1448
  if bool(auto_k):
1449
  if use_lsa and isinstance(X_space, np.ndarray):
1450
  k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
 
1476
  chosen_k = int(len(set(labels)))
1477
  return labels, centers, chosen_k
1478
 
 
1479
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
1480
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1481
  extra_terms_lower = [t.lower() for t in extra_terms]
 
1526
  if bool(use_embeddings):
1527
  kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
1528
 
 
1529
  parts = []
1530
  if bool(per_language) and "lang" in df_main.columns:
1531
  for lang_code, grp in df_main.groupby("lang", dropna=False):
 
1589
  d_word=d_word,
1590
  d_char=d_char,
1591
  )
1592
+ # NEW: stabilize per partition
1593
+ labels = stabilize_labels(
1594
+ X_space, labels,
1595
+ min_size=40,
1596
+ merge_thresh=0.96,
1597
+ reassign_thresh=0.35,
1598
+ )
1599
+
1600
+ k_agg += len(set(labels))
1601
 
1602
  term_names = cluster_labels_pmi_bigram(
1603
+ texts=texts, labels=labels, subjects=subjects_only,
1604
+ topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
1605
  )
1606
  term_names_global.update({int(k): v for k, v in term_names.items()})
1607
 
 
1608
  labels_list.append(pd.Series(labels, index=df_part.index))
1609
  cluster_name_list.append(
1610
  pd.Series(
1611
+ [term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels],
1612
  index=df_part.index,
1613
  )
1614
  )
 
1626
  df_main["cluster_name"] = "unclustered"
1627
  df_main["anomaly_score"] = np.nan
1628
 
 
1629
  if len(df_news):
1630
  df_news.loc[:, "cluster_id"] = -1
1631
  df_news.loc[:, "cluster_name"] = "newsletter/news"
 
1654
  except Exception:
1655
  pass
1656
 
 
1657
  cluster_counts = (
1658
  df.groupby(["cluster_id", "cluster_name"])
1659
  .size()
 
1692
  "max_df": float(max_df),
1693
  "use_bigrams": bool(use_bigrams),
1694
  }
1695
+ status_md = f"**Processed {len(df):,} emails** | clusters ~ {len(cluster_counts):,} (showing top 500)"
1696
 
1697
  svd_obj_out = svd_obj_local if single_partition else None
1698
  norm_obj_out = norm_obj_local if single_partition else None
 
1817
 
1818
  return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
1819
 
1820
def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
    """Re-apply every active filter control to *df* and return the sorted table.

    Returns an empty DataFrame when no data has been processed yet
    (df is None or has no rows).
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()
    filtered = _apply_filters(
        df,
        cluster,
        domain,
        sender,
        lang,
        sentiment,
        tag,
        start,
        end,
        hide_noise_flag=bool(hide_noise_flag),
    )
    return _sort_results(filtered, sort_by, sort_dir)
1827
 
1828
+ # Re-run when any filter control changes (including hide_noise)
1829
+ for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
1830
+ date_start, date_end, sort_by, sort_dir, hide_noise]:
1831
  ctrl.change(
1832
  refresh_results,
1833
  inputs=[
 
1842
  date_end,
1843
  sort_by,
1844
  sort_dir,
1845
+ hide_noise,
1846
  ],
1847
  outputs=[results_df],
1848
  )
1849
 
1850
+ # Reset filters (sets selects to (any), dates blank, sort default, and hide_noise= True)
1851
  reset_btn.click(
1852
+ lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"] + [True],
1853
  [],
1854
+ [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
1855
+ date_start, date_end, sort_by, sort_dir, hide_noise],
1856
  ).then(
1857
  refresh_results,
1858
  inputs=[
 
1867
  date_end,
1868
  sort_by,
1869
  sort_dir,
1870
+ hide_noise,
1871
  ],
1872
  outputs=[results_df],
1873
  )
1874
 
1875
+ # -------- Search helpers --------
1876
  def _tokenize_query(q: str) -> List[str]:
1877
+ return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
1878
 
1879
  def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
1880
  try:
1881
  return norm.transform(svd.transform(q_vec)).astype(np.float32)
1882
+ except Exception:
1883
  return None
1884
 
1885
  def _vectorize_query(q, vec_state, corpus_texts):
1886
+ # Build the same features for the query that we used for docs
1887
  char_min_df = 1 if len(corpus_texts) <= 1 else 2
1888
+
1889
  if vec_state.get("use_hashing"):
1890
  hv = HashingVectorizer(
1891
  analyzer="word",
 
1893
  n_features=2 ** vec_state.get("hash_bits", 18),
1894
  token_pattern=TOKEN_PATTERN,
1895
  lowercase=True,
1896
+ norm=None,
1897
+ alternate_sign=False,
1898
  )
1899
+ # Fit TF-IDF weights from corpus
1900
  counts = hv.transform(corpus_texts)
1901
  tfidf_tr = TfidfTransformer().fit(counts)
1902
  q_word = tfidf_tr.transform(hv.transform([q]))
 
1910
  token_pattern=TOKEN_PATTERN,
1911
  lowercase=True,
1912
  stop_words=STOPWORD_FOR_VEC,
1913
+ dtype=np.float32,
1914
  )
1915
  tf = cv.fit_transform(corpus_texts)
1916
  bm25 = BM25Transformer().fit(tf)
1917
  q_word = bm25.transform(cv.transform([q]))
1918
 
1919
  char_vec = CharTfidf(
1920
+ analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
1921
  ).fit(corpus_texts)
1922
  q_char = char_vec.transform([q])
1923
+
1924
  return hstack([q_word, q_char * 0.20], format="csr")
1925
 
1926
  def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
1927
  if not q or df is None or vec is None or index is None:
1928
  return pd.DataFrame(), []
1929
 
1930
+ # Search ignores newsletters/alerts/noise by default
1931
  mask = ~df["cluster_id"].isin([-1, -2, -3])
1932
  df_main = df[mask].reset_index(drop=True)
1933
  if df_main.empty:
 
1936
  q_terms = _tokenize_query(q)
1937
  q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))
1938
 
1939
+ q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd is not None and norm is not None else q_vec
1940
  if q_emb is None:
1941
  return pd.DataFrame(), q_terms
1942
 
 
1945
  return pd.DataFrame(), q_terms
1946
 
1947
  if isinstance(index, NearestNeighbors):
1948
+ # brute-force cosine on reduced space
1949
  if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
1950
  return pd.DataFrame(), q_terms
1951
  dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
1952
  sims = 1.0 - dists[0]
1953
  results = df_main.iloc[inds[0]].copy()
1954
  results["search_score"] = sims
1955
+ elif use_faiss and FAISS_OK and hasattr(index, "search"):
1956
  D, I = index.search(q_emb.astype(np.float32), k=n_req)
1957
  results = df_main.iloc[I[0]].copy()
1958
  results["search_score"] = D[0]
 
1979
  outputs=[results_df, state_query_terms],
1980
  )
1981
 
1982
+ # -------- Reader selection (build highlighted HTML) --------
1983
  def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
1984
  if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
1985
  return ""
1986
  row_idx = evt.index[0]
1987
  sel = table.iloc[row_idx]
1988
 
1989
+ # Try to match the original row
1990
  cand = df[
1991
  (df["subject"] == sel.get("subject"))
1992
  & (df["from_email"] == sel.get("from_email"))
 
2001
  cid = int(row.get("cluster_id", -99))
2002
  clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
2003
  return build_highlighted_html(
2004
+ row,
2005
+ query_terms=q_terms,
2006
+ cluster_label=clabel,
2007
+ do_highlight=do_highlight,
2008
+ extra_terms=extra_terms,
2009
  )
2010
 
2011
  results_df.select(
 
2014
  outputs=[email_view],
2015
  )
2016
 
2017
+ # Click-to-filter conveniences for summary tables
2018
def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
    """Copy the clicked summary-row's *col_name* value into a filter dropdown.

    Returns a no-op update when nothing is selected or the summary is empty.
    """
    nothing_selected = evt.index is None or df_sum is None or df_sum.empty
    if nothing_selected:
        return gr.update()
    row_pos = evt.index[0]
    chosen = df_sum.iloc[row_pos][col_name]
    return gr.update(value=chosen)
2023
 
2024
+ # Cluster summary → set cluster filter
2025
  cluster_counts_df.select(
2026
  lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
2027
  ).then(
 
2038
  date_end,
2039
  sort_by,
2040
  sort_dir,
2041
+ hide_noise,
2042
  ],
2043
  outputs=[results_df],
2044
  )
2045
+
2046
+ # Domain summary → set domain filter
2047
  domain_counts_df.select(
2048
  lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
2049
  ).then(
 
2060
  date_end,
2061
  sort_by,
2062
  sort_dir,
2063
+ hide_noise,
2064
  ],
2065
  outputs=[results_df],
2066
  )
2067
+
2068
+ # Sender summary → set sender filter
2069
  sender_counts_df.select(
2070
  lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
2071
  ).then(
 
2082
  date_end,
2083
  sort_by,
2084
  sort_dir,
2085
+ hide_noise,
2086
  ],
2087
  outputs=[results_df],
2088
  )
2089
 
 
2090
if __name__ == "__main__":
    # Disable SSR to avoid handler arity warnings under server-side rendering
    demo.launch(ssr_mode=False)