wuhp committed on
Commit
08fef30
·
verified ·
1 Parent(s): 186377a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +431 -123
app.py CHANGED
@@ -81,7 +81,7 @@ TAXONOMY = {
81
  LOBBY_DOMAINS = set() # e.g., {"acme-lobby.com"}
82
  LEGAL_DOMAINS = set() # e.g., {"biglaw.com","firmlaw.com"}
83
 
84
- def _contains_any(text: str, terms: list[str]) -> bool:
85
  if not text or not terms: return False
86
  tl = text.lower()
87
  return any(t for t in terms if t and t.lower() in tl)
@@ -111,7 +111,7 @@ TIE_MARGIN = 1.0
111
 
112
  def route_email_row(row: pd.Series) -> str:
113
  text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
114
- scores: dict[str,float] = {b: 0.0 for b in TAXONOMY.keys()}
115
  # lexicon points
116
  for b, terms in TAXONOMY.items():
117
  if not terms:
@@ -1194,7 +1194,245 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
1194
  df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
1195
  return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
1196
 
1197
- # =================== Gradio UI ===================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1198
  CSS = """
1199
  :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
1200
  .email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
@@ -1213,8 +1451,13 @@ hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
1213
  .cursor { cursor:pointer; }
1214
  """
1215
 
1216
- with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
 
1217
  gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
 
 
 
 
1218
 
1219
  with gr.Row():
1220
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
@@ -1233,7 +1476,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1233
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
1234
  lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
1235
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
1236
- k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
1237
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
1238
  with gr.Row():
1239
  use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
@@ -1263,14 +1506,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1263
  sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
1264
  lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
1265
  sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
1266
- tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "odd-hours", "personal-mail"], value="(any)")
1267
  with gr.Row():
1268
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
1269
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
1270
  sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
1271
  sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
1272
- # NEW: hide noise toggle
1273
- hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
 
 
 
 
 
1274
 
1275
  with gr.Row():
1276
  run_btn = gr.Button("Process", variant="primary")
@@ -1287,6 +1535,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1287
  actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
1288
  offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
1289
 
 
 
 
 
 
1290
  gr.Markdown("### Search")
1291
  with gr.Row():
1292
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
@@ -1294,7 +1547,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1294
  results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
1295
  email_view = gr.HTML(label="Reader")
1296
 
1297
- # State
1298
  state_df = gr.State()
1299
  state_vec = gr.State()
1300
  state_X_reduced = gr.State()
@@ -1355,7 +1608,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1355
  if bucket and bucket != "(any)":
1356
  out = out[out["bucket"] == bucket]
1357
  if cluster and cluster != "(any)":
1358
- # Modified to parse the new cluster label format
1359
  m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
1360
  if m:
1361
  cid = int(m.group(1))
@@ -1387,7 +1639,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1387
  out = out[out["cluster_id"] != -3]
1388
  return out
1389
 
1390
- # -------- Simple social network stats --------
1391
  def social_stats(df: pd.DataFrame) -> pd.DataFrame:
1392
  deg = {}
1393
  def add_edge(a,b):
@@ -1405,19 +1657,104 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1405
  out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
1406
  return out
1407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1408
  # -------- Main pipeline --------
1409
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1410
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1411
  trusted_domains_in, extra_keywords_in, highlight_toggle,
1412
- # NEW:
1413
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1414
- per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary):
 
1415
  if inbox_file is None:
1416
- return ("**Please upload a file.**",
1417
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1418
- None, None, None, None, None)
 
 
 
 
 
 
 
 
 
 
1419
 
1420
- # === Vectorization & Clustering (UPGRADED) ===
1421
  def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
1422
  texts = list(df_in.apply(enrich_text, axis=1))
1423
  subjects_only = list(df_in["subject"].fillna(""))
@@ -1516,6 +1853,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1516
 
1517
  def _cluster_space(
1518
  X_space,
 
1519
  df_part: pd.DataFrame,
1520
  use_lsa: bool,
1521
  use_hdbscan: bool,
@@ -1531,11 +1869,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1531
  ):
1532
  n = X_space.shape[0]
1533
 
 
 
 
1534
  if n <= 1:
1535
  labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
1536
  centers = None
1537
  chosen_k = int(n) if n > 0 else 0
1538
- return labels, centers, chosen_k
 
1539
  if n < 10:
1540
  k_small = min(max(2, n // 2), n)
1541
  kmeans = MiniBatchKMeans(
@@ -1546,6 +1888,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1546
  )
1547
  labels = kmeans.fit_predict(X_space)
1548
  centers = getattr(kmeans, "cluster_centers_", None)
 
1549
  return labels, centers, int(len(set(labels)))
1550
 
1551
  if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
@@ -1559,9 +1902,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1559
  )
1560
  labels = clusterer.fit_predict(X_space)
1561
  centers = None
 
1562
  chosen_k = int(len(set([l for l in labels if l >= 0])))
1563
  return labels, centers, chosen_k
1564
 
 
1565
  if bool(auto_k):
1566
  if use_lsa and isinstance(X_space, np.ndarray):
1567
  k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
@@ -1569,6 +1914,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1569
  k = auto_k_rule(X_space.shape[0])
1570
  else:
1571
  k = max(10, int(k_clusters or 350))
 
1572
 
1573
  init = None
1574
  if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
@@ -1590,9 +1936,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1590
  centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
1591
  if use_lsa and centers is not None:
1592
  labels = merge_close_clusters(labels, centers, thresh=0.95)
 
1593
  chosen_k = int(len(set(labels)))
1594
  return labels, centers, chosen_k
1595
 
 
1596
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
1597
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1598
  extra_terms_lower = [t.lower() for t in extra_terms]
@@ -1600,8 +1948,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1600
  recs = _load_json_records(inbox_file.name)
1601
  if not recs:
1602
  return ("**No valid records found.**",
1603
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1604
- None, None, None, None, None)
 
 
 
 
 
 
 
1605
 
1606
  normd = []
1607
  for r in tqdm(recs, desc="Normalize", leave=False):
@@ -1611,20 +1966,28 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1611
  df = pd.DataFrame(normd)
1612
  if df.empty:
1613
  return ("**No usable email records.**",
1614
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
1615
- None, None, None, None, None)
 
 
 
 
 
 
 
1616
 
1617
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
1618
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
1619
  df = compute_sentiment_column(df)
1620
 
1621
- # >>> NEW: stage-1 routing
1622
  df["bucket"] = df.apply(route_email_row, axis=1)
1623
  df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
1624
  df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
1625
  df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
1626
  df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
1627
 
 
1628
  flags = []
1629
  for _, row in df.iterrows():
1630
  f = []
@@ -1638,15 +2001,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1638
  flags.append(f)
1639
  df["flags"] = flags
1640
 
 
1641
  df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
1642
  df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
1643
  df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
1644
 
 
1645
  kv = None
1646
  emb_dim = 0
1647
  if bool(use_embeddings):
1648
  kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
1649
 
 
1650
  parts = []
1651
  if bool(per_language):
1652
  for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
@@ -1657,11 +2023,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1657
  parts.append(((bkt, "all"), grp.copy()))
1658
 
1659
  labels_list, cluster_name_list, anomaly_list = [], [], []
1660
- bucket_indexers = [] # keep index locations to reassign later
1661
  X_reduced_holder = None
1662
  term_names_global = {}
1663
  single_partition = (len(parts) == 1)
1664
- d_word_agg, d_char_agg, k_agg = 0, 0, 0
1665
  svd_obj_local, norm_obj_local = None, None
1666
 
1667
  for (bucket_name, _lang), df_part in parts:
@@ -1697,8 +2063,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1697
  except Exception:
1698
  pass
1699
 
1700
- labels, _, chosen_k = _cluster_space(
1701
  X_space=X_space,
 
1702
  df_part=df_part,
1703
  use_lsa=bool(use_lsa) and X_reduced is not None,
1704
  use_hdbscan=bool(use_hdbscan),
@@ -1713,25 +2080,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1713
  d_word=d_word,
1714
  d_char=d_char,
1715
  )
1716
- labels = stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35)
1717
- k_agg += len(set(labels))
1718
 
1719
  term_names = cluster_labels_pmi_bigram(
1720
  texts=texts, labels=labels, subjects=subjects_only,
1721
  topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
1722
  )
1723
 
1724
- # collect to write back in one go
1725
  bucket_indexers.append(df_part.index)
1726
  labels_list.append(pd.Series(labels, index=df_part.index))
1727
  cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
1728
  anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
1729
- # remember term names for the reader pill
1730
  term_names_global.update({int(k): v for k, v in term_names.items()})
1731
 
1732
  if single_partition and X_reduced is not None:
1733
  X_reduced_holder = X_reduced
1734
-
1735
  if labels_list:
1736
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
1737
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
@@ -1741,6 +2104,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1741
  df_main["cluster_name"] = "unclustered"
1742
  df_main["anomaly_score"] = np.nan
1743
 
 
1744
  if len(df_news):
1745
  df_news.loc[:, "cluster_id"] = -1
1746
  df_news.loc[:, "cluster_name"] = "newsletter/news"
@@ -1750,10 +2114,22 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1750
  df_alerts.loc[:, "cluster_name"] = "system/alerts"
1751
  df_alerts.loc[:, "anomaly_score"] = np.nan
1752
 
 
1753
  df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
 
 
 
 
 
1754
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1755
  df = compute_context_anomaly(df)
1756
 
 
 
 
 
 
 
1757
  index_obj = None
1758
  use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
1759
  if use_faiss_flag:
@@ -1770,6 +2146,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1770
  except Exception:
1771
  pass
1772
 
 
1773
  cluster_counts = (
1774
  df.groupby(["bucket", "cluster_id", "cluster_name"])
1775
  .size()
@@ -1816,9 +2193,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1816
  norm_obj_out = norm_obj_local if single_partition else None
1817
 
1818
  return (
1819
- status_md, cluster_counts, domain_counts, sender_counts, actors, offhours_table,
1820
- out_table, df, vec_state, X_reduced_holder, index_obj, term_names_global,
1821
- bool(use_lsa), use_faiss_flag,
 
 
 
 
 
1822
  gr.update(choices=cluster_choices, value="(any)"),
1823
  gr.update(choices=domain_choices, value="(any)"),
1824
  gr.update(choices=sender_choices, value="(any)"),
@@ -1828,6 +2210,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1828
  gr.update(choices=bucket_choices, value="(any)")
1829
  )
1830
 
 
1831
  (run_btn.click)(
1832
  process_file,
1833
  inputs=[
@@ -1836,46 +2219,24 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1836
  trusted_domains_in, extra_keywords_in, highlight_toggle,
1837
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1838
  per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
 
1839
  ],
1840
  outputs=[
1841
  status, cluster_counts_df, domain_counts_df, sender_counts_df,
1842
- actors_df, offhours_df, results_df,
1843
- state_df, state_vec, state_X_reduced, state_index, state_term_names,
 
 
 
1844
  state_use_lsa, state_use_faiss,
1845
  cluster_drop, domain_drop, sender_drop, lang_drop,
1846
- state_svd, state_norm, state_dims, state_extra_terms, state_highlight,
 
1847
  bucket_drop,
1848
  ],
1849
  )
1850
 
1851
  # -------- Filtering & Search --------
1852
- def _sort_results(df, by, direction):
1853
- if df is None or len(df) == 0:
1854
- return pd.DataFrame()
1855
- tmp = df.copy()
1856
- if "date" in tmp.columns:
1857
- tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
1858
- else:
1859
- tmp["_dt"] = pd.NaT
1860
- by = by if by in tmp.columns else "context_anomaly_score"
1861
- asc = (direction == "asc")
1862
- sort_cols = [by]
1863
- if by == "date":
1864
- sort_cols = ["_dt"]
1865
- elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
1866
- sort_cols.append("_dt")
1867
-
1868
- tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
1869
-
1870
- cols_out = [
1871
- "date","bucket","from_email","from_domain","subject","cluster_name","lang",
1872
- "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
1873
- ]
1874
- if "search_score" in tmp.columns:
1875
- cols_out.append("search_score")
1876
-
1877
- return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
1878
-
1879
  def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
1880
  if df is None or len(df) == 0:
1881
  return pd.DataFrame()
@@ -1884,7 +2245,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1884
  )
1885
  return _sort_results(filt, sort_by, sort_dir)
1886
 
1887
- # Re-run when any filter control changes (including hide_noise)
1888
  for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
1889
  date_start, date_end, sort_by, sort_dir, hide_noise]:
1890
  ctrl.change(
@@ -1911,57 +2272,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1911
  outputs=[results_df],
1912
  )
1913
 
1914
- # -------- Search helpers --------
1915
- def _tokenize_query(q: str) -> List[str]:
1916
- return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
1917
-
1918
- def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
1919
- try:
1920
- return norm.transform(svd.transform(q_vec)).astype(np.float32)
1921
- except Exception:
1922
- return None
1923
-
1924
- def _vectorize_query(q, vec_state, corpus_texts):
1925
- # Build the same features for the query that we used for docs
1926
- char_min_df = 1 if len(corpus_texts) <= 1 else 2
1927
-
1928
- if vec_state.get("use_hashing"):
1929
- hv = HashingVectorizer(
1930
- analyzer="word",
1931
- ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
1932
- n_features=2 ** vec_state.get("hash_bits", 18),
1933
- token_pattern=TOKEN_PATTERN,
1934
- lowercase=True,
1935
- norm=None,
1936
- alternate_sign=False,
1937
- )
1938
- # Fit TF-IDF weights from corpus
1939
- counts = hv.transform(corpus_texts)
1940
- tfidf_tr = TfidfTransformer().fit(counts)
1941
- q_word = tfidf_tr.transform(hv.transform([q]))
1942
- else:
1943
- cv = CountVectorizer(
1944
- analyzer="word",
1945
- ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
1946
- max_features=vec_state.get("max_features"),
1947
- min_df=vec_state.get("min_df"),
1948
- max_df=vec_state.get("max_df"),
1949
- token_pattern=TOKEN_PATTERN,
1950
- lowercase=True,
1951
- stop_words=STOPWORD_FOR_VEC,
1952
- dtype=np.float32,
1953
- )
1954
- tf = cv.fit_transform(corpus_texts)
1955
- bm25 = BM25Transformer().fit(tf)
1956
- q_word = bm25.transform(cv.transform([q]))
1957
-
1958
- char_vec = CharTfidf(
1959
- analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
1960
- ).fit(corpus_texts)
1961
- q_char = char_vec.transform([q])
1962
-
1963
- return hstack([q_word, q_char * 0.20], format="csr")
1964
-
1965
  def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
1966
  if not q or df is None or vec is None or index is None:
1967
  return pd.DataFrame(), []
@@ -1984,7 +2295,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1984
  return pd.DataFrame(), q_terms
1985
 
1986
  if isinstance(index, NearestNeighbors):
1987
- # brute-force cosine on reduced space
1988
  if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
1989
  return pd.DataFrame(), q_terms
1990
  dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
@@ -2009,7 +2319,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2009
  outputs=[results_df, state_query_terms],
2010
  )
2011
 
2012
- # -------- Reader selection (build highlighted HTML) --------
2013
  def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
2014
  if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
2015
  return ""
@@ -2044,7 +2354,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2044
  outputs=[email_view],
2045
  )
2046
 
2047
- # Click-to-filter conveniences for summary tables
2048
  def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
2049
  if evt.index is None or df_sum is None or df_sum.empty:
2050
  return gr.update()
@@ -2057,7 +2367,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2057
  r = df_sum.iloc[evt.index[0]]
2058
  return gr.update(value=r["bucket"]), gr.update(value=r["label"])
2059
 
2060
- # Cluster summary → set bucket & cluster filter
2061
  cluster_counts_df.select(
2062
  on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
2063
  ).then(
@@ -2069,7 +2378,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2069
  outputs=[results_df],
2070
  )
2071
 
2072
- # Domain summary → set domain filter
2073
  domain_counts_df.select(
2074
  lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
2075
  ).then(
@@ -2081,7 +2389,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2081
  outputs=[results_df],
2082
  )
2083
 
2084
- # Sender summary → set sender filter
2085
  sender_counts_df.select(
2086
  lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
2087
  ).then(
@@ -2095,4 +2402,5 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
2095
 
2096
  if __name__ == "__main__":
2097
  # Disable SSR to avoid handler arity warnings under server-side rendering
2098
- demo.launch(ssr_mode=False)
 
 
81
  LOBBY_DOMAINS = set() # e.g., {"acme-lobby.com"}
82
  LEGAL_DOMAINS = set() # e.g., {"biglaw.com","firmlaw.com"}
83
 
84
+ def _contains_any(text: str, terms: list) -> bool:
85
  if not text or not terms: return False
86
  tl = text.lower()
87
  return any(t for t in terms if t and t.lower() in tl)
 
111
 
112
  def route_email_row(row: pd.Series) -> str:
113
  text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
114
+ scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
115
  # lexicon points
116
  for b, terms in TAXONOMY.items():
117
  if not terms:
 
1194
  df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
1195
  return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
1196
 
1197
+ # =================== 🔧 NEW: Per-bucket k & stabilizer params ===================
1198
+ def _bucket_k_multiplier(bucket_name: str) -> float:
1199
+ b = (bucket_name or "").lower()
1200
+ if b in ("constituent",): return 1.25
1201
+ if b in ("procurement", "campaign finance", "receipts/billing", "lobbyist"): return 1.15
1202
+ if b in ("scheduling", "other"): return 1.00
1203
+ if b in ("legal",): return 0.80
1204
+ return 1.00
1205
+
1206
+ def _bucket_stabilizer_params(bucket_name: str) -> Tuple[int, float, float]:
1207
+ b = (bucket_name or "").lower()
1208
+ if b == "legal": return (30, 0.97, 0.38)
1209
+ if b == "procurement": return (35, 0.96, 0.36)
1210
+ if b == "campaign finance": return (35, 0.96, 0.36)
1211
+ if b == "constituent": return (40, 0.95, 0.33)
1212
+ if b == "receipts/billing": return (40, 0.95, 0.35)
1213
+ if b == "scheduling": return (35, 0.95, 0.35)
1214
+ return (40, 0.96, 0.35)
1215
+
1216
+ # =================== 🔧 NEW: Label de-dup helpers ===================
1217
def _normalize_label_tokens(label: str) -> set:
    """Reduce a cluster label to a normalized keyword set for similarity tests.

    Lowercases the label, extracts Latin/Hebrew word tokens (2+ chars),
    strips a trailing plural "s" from tokens longer than 3 chars, then
    drops stopwords (STOP_TERMS / EN_STOP / HE_STOP) and anything shorter
    than 2 chars.
    """
    if not label:
        return set()
    lowered = str(label).lower()
    raw_tokens = re.findall(r"[a-z\u0590-\u05FF][a-z\u0590-\u05FF\-']{1,}", lowered)
    singulars = [tok[:-1] if len(tok) > 3 and tok.endswith("s") else tok for tok in raw_tokens]
    return {
        tok
        for tok in singulars
        if tok not in STOP_TERMS and tok not in EN_STOP and tok not in HE_STOP and len(tok) >= 2
    }
1223
+
1224
+ def _jaccard(a: set, b: set) -> float:
1225
+ if not a or not b: return 0.0
1226
+ inter = len(a & b)
1227
+ if inter == 0: return 0.0
1228
+ return inter / float(len(a | b))
1229
+
1230
def dedupe_cluster_labels_in_bucket(df: pd.DataFrame, bucket: str, sim_thresh: float = 0.72) -> pd.DataFrame:
    """Merge near-duplicate cluster display names within one bucket.

    Cluster labels whose normalized token sets have Jaccard similarity >=
    ``sim_thresh`` are grouped via union-find, and every cluster in a group
    is renamed to a single canonical label (the longest name in the group).
    Mutates ``df["cluster_name"]`` in place for the affected rows and also
    returns ``df`` for chaining.
    """
    sel = df[df["bucket"] == bucket].copy()
    if sel.empty or "cluster_name" not in sel.columns:
        return df

    # One (cluster_id, cluster_name) pair per cluster; token sets drive similarity.
    names = sel[["cluster_id", "cluster_name"]].drop_duplicates()
    tokens = {int(cid): _normalize_label_tokens(str(name)) for cid, name in names.values}
    ids = list(tokens.keys())

    # Minimal union-find over cluster ids (no path compression; groups are small).
    parent = {i: i for i in ids}
    def find(i):
        while parent[i] != i: i = parent[i]
        return i
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb: parent[rb] = ra

    # Union every pair of clusters whose label token sets are similar enough.
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            if _jaccard(tokens[ids[i]], tokens[ids[j]]) >= sim_thresh:
                union(ids[i], ids[j])

    names_map = dict(names.values)
    # Group cluster ids by union-find root, keeping each id's original label.
    comp_to_canon = {}
    for cid in ids:
        root = find(cid)
        comp_to_canon.setdefault(root, [])
        comp_to_canon[root].append((cid, names_map.get(cid, "")))

    # Canonical label per group: longest name, ties broken lexicographically.
    canon_for_cluster = {}
    for root, items in comp_to_canon.items():
        best = max(items, key=lambda kv: (len(kv[1] or ""), kv[1]))
        for cid, _ in items:
            canon_for_cluster[cid] = best[1]

    # Rewrite labels only for this bucket's rows; unmatched ids keep their name.
    df.loc[sel.index, "cluster_name"] = sel["cluster_id"].map(lambda c: canon_for_cluster.get(int(c), names_map.get(int(c), "")))
    return df
1267
+
1268
def dedupe_all_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Run cluster-label de-duplication over every bucket present in *df*."""
    result = df
    for bucket in sorted(df["bucket"].dropna().unique()):
        result = dedupe_cluster_labels_in_bucket(result, bucket, sim_thresh=0.72)
    return result
1273
+
1274
+ # =================== 🔎 NEW: Surveillance-campaign detection ===================
1275
# Keyword lexicon: phrases typical of recurring monitoring / briefing emails.
SURV_KEYWORDS = [
    "daily report","daily brief","briefing","sitreps","sitrep","situation report","summary",
    "dossier","monitoring","tracking","watchlist","watch list","profile","surveillance",
    "intel","intelligence","osint","open source intel","clippings","press clips","digest",
    "alert","alerting","dispatch","bulletin","roundup","update"
]
# Case-insensitive alternation over the escaped keywords above.
SURV_RE = re.compile("|".join([re.escape(k) for k in SURV_KEYWORDS]), re.I)

# Subject-normalization patterns: collapse concrete values into placeholders
# so repeated "templated" subjects compare equal.
SUBJ_NUM_RE = re.compile(r"\b\d{1,4}([,./-]\d{1,4})*\b")  # numbers / numeric codes
SUBJ_DATE_RE = re.compile(r"\b(?:\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?|\d{4}-\d{2}-\d{2}|jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b", re.I)  # dates / month names
SUBJ_FW_RE = re.compile(r"^\s*(re:|fw:|fwd:)\s*", re.I)  # reply/forward prefixes
EMAIL_RE = re.compile(r"\b[\w.\-+%]+@[\w.-]+\.[A-Za-z]{2,}\b")  # email addresses
1287
+
1288
def _candidate_entities_from_subjects(df: pd.DataFrame, extra_watchlist: List[str]) -> List[str]:
    """Collect candidate entity names: explicit watchlist entries plus
    capitalized 1-3 word sequences mined from email subjects.

    Filters out English stopwords and names shorter than 5 characters;
    the result is sorted and capped at 3000 entries.
    """
    candidates = {w.strip() for w in (extra_watchlist or []) if w.strip()}
    name_pattern = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b")
    for subject in df["subject"].dropna().astype(str).tolist():
        for match in name_pattern.finditer(subject):
            candidate = match.group(1).strip()
            if candidate.lower() in EN_STOP or len(candidate) < 5:
                continue
            candidates.add(candidate)
    return sorted(candidates)[:3000]
1300
+
1301
def _normalize_subject_template(subj: str, entity: str) -> str:
    """Canonicalize a subject line into a reusable "template" string.

    Strips Re:/Fw: prefixes, then replaces the entity name, dates, numbers,
    and email addresses with placeholder tokens, collapses whitespace, and
    lowercases — so repeated campaign subjects normalize to the same string.
    """
    if not subj:
        return ""
    templ = SUBJ_FW_RE.sub("", subj)
    try:
        templ = re.sub(re.escape(entity), "«ENTITY»", templ, flags=re.I)
    except Exception:
        # Defensive: a pathological entity string must not break normalization.
        pass
    templ = SUBJ_DATE_RE.sub("«DATE»", templ)
    templ = SUBJ_NUM_RE.sub("«NUM»", templ)
    templ = EMAIL_RE.sub("«EMAIL»", templ)
    return re.sub(r"\s+", " ", templ).strip().lower()
1313
+
1314
+ def _entity_mask_present(row: pd.Series, entity: str) -> bool:
1315
+ t = (row.get("subject","") + " " + row.get("body_text","")).lower()
1316
+ e = (entity or "").lower()
1317
+ return (e in t) if e else False
1318
+
1319
def detect_surveillance_campaigns(
    df: pd.DataFrame,
    watchlist: Optional[List[str]] = None,
    min_mentions: int = 15,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Score candidate entities for recurring "surveillance-style" email campaigns.

    For every candidate entity (explicit watchlist + capitalized names mined
    from subjects) mentioned in at least ``min_mentions`` emails, compute a
    heuristic score from: subject-template repetition, surveillance-keyword
    share, weekly cadence spike (z-score of the last week vs. earlier weeks),
    sender/domain spread, internal-mail share, and average recipient count.

    Args:
        df: normalized email frame (needs date/subject/body_text/bucket/
            from_email/from_domain/to_emails columns).
        watchlist: optional extra entity names to always consider.
        min_mentions: minimum mention count for an entity to be scored.

    Returns:
        (entities, samples): per-entity summary (top 200 by score, level in
        {"info","possible","likely"}) and up to 8 example emails per flagged
        entity. Both are empty DataFrames when nothing qualifies.
    """
    if df.empty:
        return pd.DataFrame(), pd.DataFrame()

    watch = [w.strip() for w in (watchlist or []) if w.strip()]
    cands = _candidate_entities_from_subjects(df, watch)

    # Work on a copy with derived helper columns; never mutate the caller's frame.
    dfd = df.copy()
    dfd["_dt"] = pd.to_datetime(dfd["date"], utc=True, errors="coerce")
    dfd["_day"] = dfd["_dt"].dt.date
    dfd["_week"] = dfd["_dt"].dt.to_period("W").astype(str)
    dfd["_is_news"] = (dfd["bucket"] == "Newsletters/Alerts")
    dfd["_is_it"] = (dfd["bucket"] == "IT/Security")
    dfd["_is_internal"] = ~(dfd["_is_news"] | dfd["_is_it"])
    dfd["_recips"] = dfd["to_emails"].apply(lambda xs: len(xs) if isinstance(xs, list) else 0)

    # Perf fix: build the lowercase searchable text ONCE. The previous version
    # ran a per-row apply for every candidate (O(entities * rows) Python calls);
    # a vectorized substring test per entity gives the same mentions mask.
    combined_text = (
        dfd["subject"].fillna("").astype(str) + " " + dfd["body_text"].fillna("").astype(str)
    ).str.lower()

    rows = []
    samples = []

    for entity in cands:
        # Candidates are always non-empty strings, so a plain substring match
        # mirrors the old _entity_mask_present() semantics.
        mask = combined_text.str.contains(entity.lower(), regex=False)
        grp = dfd[mask]
        n = len(grp)
        if n < int(min_mentions):
            continue

        n_senders = grp["from_email"].nunique()
        n_domains = grp["from_domain"].nunique()
        pct_news = float((grp["_is_news"].mean() if n else 0.0))
        pct_int = float((grp["_is_internal"].mean() if n else 0.0))
        avg_recips = float((grp["_recips"].mean() if n else 0.0))

        # Cadence spike: z-score of the latest week against the earlier weeks.
        wk = grp.groupby("_week").size().astype(float)
        if len(wk) >= 4:
            baseline = wk.iloc[:-1]
            mu = float(baseline.mean()) if len(baseline) else 0.0
            sd = float(baseline.std(ddof=1)) if len(baseline) > 1 else 0.0
            last = float(wk.iloc[-1])
            weekly_peak_z = 0.0 if sd == 0.0 else (last - mu) / sd
        else:
            weekly_peak_z = 0.0

        # Template repetition: share of the single most common normalized subject.
        norm_subj = grp["subject"].fillna("").astype(str).map(lambda s: _normalize_subject_template(s, entity))
        if len(norm_subj):
            top_template_share = norm_subj.value_counts(normalize=True).iloc[0]
        else:
            top_template_share = 0.0

        # Share of mentions whose text contains a surveillance keyword.
        kw_share = float(((grp["subject"].fillna("") + " " + grp["body_text"].fillna("")).str.contains(SURV_RE).mean()) if n else 0.0)

        # Weighted heuristic score; each component is capped at its weight.
        score = 0.0
        score += 2.5 * min(1.0, top_template_share)
        score += 2.0 * min(1.0, kw_share)
        score += 1.5 * min(1.0, weekly_peak_z / 3.0)
        score += 0.8 * min(1.0, n_senders / 10.0)
        score += 0.5 * min(1.0, n_domains / 10.0)
        score += 1.0 * pct_int
        score += 0.3 * min(1.0, avg_recips / 10.0)

        level = "info"
        if score >= 6.5:
            level = "likely"
        elif score >= 4.5:
            level = "possible"

        first_d = grp["_dt"].min()
        last_d = grp["_dt"].max()

        rows.append({
            "entity": entity,
            "surveillance_score": round(float(score), 3),
            "level": level,
            "n_emails": int(n),
            "n_senders": int(n_senders),
            "n_domains": int(n_domains),
            "pct_newsletters": round(pct_news, 3),
            "pct_internal": round(pct_int, 3),
            "avg_recipients": round(avg_recips, 2),
            "weekly_peak_z": round(float(weekly_peak_z), 3),
            "template_max_share": round(float(top_template_share), 3),
            "keyword_share": round(float(kw_share), 3),
            "first_date": str(first_d) if pd.notna(first_d) else "",
            "last_date": str(last_d) if pd.notna(last_d) else "",
            "notes": "template/keywords/cadence/senders/domains mix"
        })

        ex = grp[["date","from_email","from_domain","subject","bucket"]].copy().head(8)
        ex.insert(0, "entity", entity)
        samples.append(ex)

    # Bug fix: sort_values on a DataFrame built from an empty `rows` list has no
    # columns and raised KeyError for "surveillance_score"; return empty instead.
    if rows:
        ent_df = pd.DataFrame(rows).sort_values(["surveillance_score","n_emails"], ascending=[False, False]).head(200)
    else:
        ent_df = pd.DataFrame()
    samp_df = pd.concat(samples, ignore_index=True) if samples else pd.DataFrame()

    return ent_df, samp_df
1415
+
1416
def tag_surveillance_emails(df: pd.DataFrame, ent_df: pd.DataFrame, threshold: float = 4.5) -> pd.DataFrame:
    """Add a ``surveillance`` tag to emails that mention any high-scoring entity.

    Parameters
    ----------
    df : emails frame with at least ``subject``, ``body_text`` and ``tags`` columns.
    ent_df : output of ``detect_surveillance_campaigns`` — must carry ``entity``
        and ``surveillance_score`` columns.
    threshold : minimum ``surveillance_score`` for an entity to trigger tagging.

    Returns
    -------
    The original frame untouched when there is nothing to tag, otherwise a copy
    whose ``tags`` lists are rebuilt (de-duplicated and sorted).
    """
    if df.empty or ent_df.empty:
        return df
    hot = ent_df.loc[ent_df["surveillance_score"] >= float(threshold), "entity"].tolist()
    if not hot:
        return df
    # Hoist the per-entity lowercasing out of the per-row apply (loop-invariant).
    hot_lower = [e.lower() for e in hot]

    def _tag(row):
        # Robustness fix: subject/body cells may be NaN (float) in a pandas
        # frame; the previous bare string concat raised TypeError. Coerce to ""
        # for any non-string value, matching the .fillna("") guards used
        # elsewhere in this file.
        subj = row.get("subject", "")
        body = row.get("body_text", "")
        subj = subj if isinstance(subj, str) else ""
        body = body if isinstance(body, str) else ""
        txt = f"{subj} {body}".lower()
        tags = set(row.get("tags") or [])
        if any(e in txt for e in hot_lower):
            tags.add("surveillance")
        return sorted(tags)

    out = df.copy()
    out["tags"] = out.apply(_tag, axis=1)
    return out
1432
+
1433
+ # =================== UI / PIPELINE CONTINUATION ===================
1434
+
1435
+ # ---------- Styles ----------
1436
  CSS = """
1437
  :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
1438
  .email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
 
1451
  .cursor { cursor:pointer; }
1452
  """
1453
 
1454
+ # ---------- App ----------
1455
+ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Surveillance Radar", css=CSS, theme="soft") as demo:
1456
  gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
1457
+ gr.Markdown(
1458
+ "This build includes per-bucket **k** heuristics, label **de-dup**, and a **surveillance-campaign detector** "
1459
+ "(template cadence + keywords + multi-sender/domain signals)."
1460
+ )
1461
 
1462
  with gr.Row():
1463
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
 
1476
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
1477
  lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
1478
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
1479
+ k_clusters = gr.Number(label="Base k (before per-bucket multiplier)", value=350, precision=0)
1480
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
1481
  with gr.Row():
1482
  use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
 
1506
  sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
1507
  lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
1508
  sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
1509
+ tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "surveillance", "odd-hours", "personal-mail"], value="(any)")
1510
  with gr.Row():
1511
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
1512
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
1513
  sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
1514
  sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
1515
+ with gr.Row():
1516
+ hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
1517
+
1518
+ with gr.Accordion("Surveillance Radar", open=True):
1519
+ with gr.Row():
1520
+ watchlist_in = gr.Textbox(label="Watchlist (names or entities, comma-separated)", value="Hillary Clinton, Joe Biden, Donald Trump")
1521
+ min_mentions = gr.Number(label="Min mentions per entity", value=15, precision=0)
1522
 
1523
  with gr.Row():
1524
  run_btn = gr.Button("Process", variant="primary")
 
1535
  actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
1536
  offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
1537
 
1538
+ gr.Markdown("### Surveillance Campaigns (detected entities)")
1539
+ with gr.Row():
1540
+ surv_entities_df = gr.Dataframe(label="Entities ranked by surveillance score", interactive=False, wrap=True)
1541
+ surv_samples_df = gr.Dataframe(label="Sample emails for highlighted entities", interactive=False, wrap=True)
1542
+
1543
  gr.Markdown("### Search")
1544
  with gr.Row():
1545
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
 
1547
  results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
1548
  email_view = gr.HTML(label="Reader")
1549
 
1550
+ # -------- State --------
1551
  state_df = gr.State()
1552
  state_vec = gr.State()
1553
  state_X_reduced = gr.State()
 
1608
  if bucket and bucket != "(any)":
1609
  out = out[out["bucket"] == bucket]
1610
  if cluster and cluster != "(any)":
 
1611
  m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
1612
  if m:
1613
  cid = int(m.group(1))
 
1639
  out = out[out["cluster_id"] != -3]
1640
  return out
1641
 
1642
+ # -------- Social graph summary --------
1643
  def social_stats(df: pd.DataFrame) -> pd.DataFrame:
1644
  deg = {}
1645
  def add_edge(a,b):
 
1657
  out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
1658
  return out
1659
 
1660
+ # -------- Sorting helper --------
1661
+ def _sort_results(df, by, direction):
1662
+ if df is None or len(df) == 0:
1663
+ return pd.DataFrame()
1664
+ tmp = df.copy()
1665
+ if "date" in tmp.columns:
1666
+ tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
1667
+ else:
1668
+ tmp["_dt"] = pd.NaT
1669
+ by = by if by in tmp.columns else "context_anomaly_score"
1670
+ asc = (direction == "asc")
1671
+ sort_cols = [by]
1672
+ if by == "date":
1673
+ sort_cols = ["_dt"]
1674
+ elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
1675
+ sort_cols.append("_dt")
1676
+ tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
1677
+ cols_out = [
1678
+ "date","bucket","from_email","from_domain","subject","cluster_name","lang",
1679
+ "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
1680
+ ]
1681
+ if "search_score" in tmp.columns:
1682
+ cols_out.append("search_score")
1683
+ return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
1684
+
1685
+ # -------- Vectorization helpers (mirror training path for queries) --------
1686
+ def _tokenize_query(q: str) -> List[str]:
1687
+ return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
1688
+
1689
+ def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
1690
+ try:
1691
+ return norm.transform(svd.transform(q_vec)).astype(np.float32)
1692
+ except Exception:
1693
+ return None
1694
+
1695
def _vectorize_query(q, vec_state, corpus_texts):
    """Vectorize a search query in the same feature space as the documents.

    Re-fits the word-level (hashing+TF-IDF or count+BM25) and char-level
    vectorizers on ``corpus_texts`` using the settings recorded in
    ``vec_state``, then transforms the single query ``q`` and concatenates
    both feature blocks (char-grams down-weighted by 0.20) into one sparse
    row vector.

    NOTE(review): this refits the vectorizers over the full corpus on every
    call — presumably acceptable for interactive search; confirm corpus size
    before using on very large mailboxes.
    """
    # Build the same features for the query that we used for docs
    # Char n-grams need min_df=1 when the corpus is a single document,
    # otherwise the fit would drop every feature.
    char_min_df = 1 if len(corpus_texts) <= 1 else 2

    if vec_state.get("use_hashing"):
        # Hashing path: stateless word features + TF-IDF weights fit on corpus.
        hv = HashingVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            n_features=2 ** vec_state.get("hash_bits", 18),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            norm=None,
            alternate_sign=False,
        )
        counts = hv.transform(corpus_texts)
        tfidf_tr = TfidfTransformer().fit(counts)
        q_word = tfidf_tr.transform(hv.transform([q]))
    else:
        # Vocabulary path: counts with the recorded df bounds, BM25-weighted.
        cv = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            max_features=vec_state.get("max_features"),
            min_df=vec_state.get("min_df"),
            max_df=vec_state.get("max_df"),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            stop_words=STOPWORD_FOR_VEC,
            dtype=np.float32,
        )
        tf = cv.fit_transform(corpus_texts)
        bm25 = BM25Transformer().fit(tf)
        q_word = bm25.transform(cv.transform([q]))

    # Character 3–5-grams add robustness to typos/obfuscation.
    char_vec = CharTfidf(
        analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
    ).fit(corpus_texts)
    q_char = char_vec.transform([q])

    # Concatenate word + (down-weighted) char blocks into one CSR row.
    return hstack([q_word, q_char * 0.20], format="csr")
1734
+
1735
  # -------- Main pipeline --------
1736
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1737
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1738
  trusted_domains_in, extra_keywords_in, highlight_toggle,
 
1739
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
1740
+ per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
1741
+ watchlist_in, min_mentions):
1742
  if inbox_file is None:
1743
+ return (
1744
+ "**Please upload a file.**",
1745
+ None, None, None, None, None,
1746
+ None, None, # surveillance outputs
1747
+ None, # results_df
1748
+ None, None, None, # states df/vec/X
1749
+ None, None, # index & term names
1750
+ None, None, # flags
1751
+ gr.update(), gr.update(), gr.update(), gr.update(), # dropdowns
1752
+ None, None, None, # svd/norm/dims
1753
+ None, None, # extra terms / highlight
1754
+ gr.update() # bucket list
1755
+ )
1756
 
1757
+ # === Inner helpers for this function ===
1758
  def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
1759
  texts = list(df_in.apply(enrich_text, axis=1))
1760
  subjects_only = list(df_in["subject"].fillna(""))
 
1853
 
1854
  def _cluster_space(
1855
  X_space,
1856
+ bucket_name: str,
1857
  df_part: pd.DataFrame,
1858
  use_lsa: bool,
1859
  use_hdbscan: bool,
 
1869
  ):
1870
  n = X_space.shape[0]
1871
 
1872
+ # Per-bucket stabilizer params
1873
+ min_size, merge_th, reassign_th = _bucket_stabilizer_params(bucket_name)
1874
+
1875
  if n <= 1:
1876
  labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
1877
  centers = None
1878
  chosen_k = int(n) if n > 0 else 0
1879
+ return stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th), centers, chosen_k
1880
+
1881
  if n < 10:
1882
  k_small = min(max(2, n // 2), n)
1883
  kmeans = MiniBatchKMeans(
 
1888
  )
1889
  labels = kmeans.fit_predict(X_space)
1890
  centers = getattr(kmeans, "cluster_centers_", None)
1891
+ labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
1892
  return labels, centers, int(len(set(labels)))
1893
 
1894
  if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
 
1902
  )
1903
  labels = clusterer.fit_predict(X_space)
1904
  centers = None
1905
+ labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
1906
  chosen_k = int(len(set([l for l in labels if l >= 0])))
1907
  return labels, centers, chosen_k
1908
 
1909
+ # Choose k (global rule or kneedle), then per-bucket multiplier
1910
  if bool(auto_k):
1911
  if use_lsa and isinstance(X_space, np.ndarray):
1912
  k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
 
1914
  k = auto_k_rule(X_space.shape[0])
1915
  else:
1916
  k = max(10, int(k_clusters or 350))
1917
+ k = int(max(2, round(k * _bucket_k_multiplier(bucket_name))))
1918
 
1919
  init = None
1920
  if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
 
1936
  centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
1937
  if use_lsa and centers is not None:
1938
  labels = merge_close_clusters(labels, centers, thresh=0.95)
1939
+ labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
1940
  chosen_k = int(len(set(labels)))
1941
  return labels, centers, chosen_k
1942
 
1943
+ # ---- Begin processing ----
1944
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
1945
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1946
  extra_terms_lower = [t.lower() for t in extra_terms]
 
1948
  recs = _load_json_records(inbox_file.name)
1949
  if not recs:
1950
  return ("**No valid records found.**",
1951
+ None, None, None, None, None,
1952
+ None, None,
1953
+ None,
1954
+ None, None, None,
1955
+ None, None,
1956
+ None, None, None, None,
1957
+ None, None, None,
1958
+ None, None,
1959
+ None)
1960
 
1961
  normd = []
1962
  for r in tqdm(recs, desc="Normalize", leave=False):
 
1966
  df = pd.DataFrame(normd)
1967
  if df.empty:
1968
  return ("**No usable email records.**",
1969
+ None, None, None, None, None,
1970
+ None, None,
1971
+ None,
1972
+ None, None, None,
1973
+ None, None,
1974
+ None, None, None, None,
1975
+ None, None, None,
1976
+ None, None,
1977
+ None)
1978
 
1979
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
1980
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
1981
  df = compute_sentiment_column(df)
1982
 
1983
+ # Stage-1 routing (bucketing)
1984
  df["bucket"] = df.apply(route_email_row, axis=1)
1985
  df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
1986
  df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
1987
  df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
1988
  df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
1989
 
1990
+ # Flags
1991
  flags = []
1992
  for _, row in df.iterrows():
1993
  f = []
 
2001
  flags.append(f)
2002
  df["flags"] = flags
2003
 
2004
+ # Split out stable buckets
2005
  df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
2006
  df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
2007
  df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
2008
 
2009
+ # Optional embeddings
2010
  kv = None
2011
  emb_dim = 0
2012
  if bool(use_embeddings):
2013
  kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
2014
 
2015
+ # Build partitions: per language within bucket if requested
2016
  parts = []
2017
  if bool(per_language):
2018
  for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
 
2023
  parts.append(((bkt, "all"), grp.copy()))
2024
 
2025
  labels_list, cluster_name_list, anomaly_list = [], [], []
2026
+ bucket_indexers = []
2027
  X_reduced_holder = None
2028
  term_names_global = {}
2029
  single_partition = (len(parts) == 1)
2030
+ d_word_agg, d_char_agg = 0, 0
2031
  svd_obj_local, norm_obj_local = None, None
2032
 
2033
  for (bucket_name, _lang), df_part in parts:
 
2063
  except Exception:
2064
  pass
2065
 
2066
+ labels, _, _ = _cluster_space(
2067
  X_space=X_space,
2068
+ bucket_name=bucket_name,
2069
  df_part=df_part,
2070
  use_lsa=bool(use_lsa) and X_reduced is not None,
2071
  use_hdbscan=bool(use_hdbscan),
 
2080
  d_word=d_word,
2081
  d_char=d_char,
2082
  )
 
 
2083
 
2084
  term_names = cluster_labels_pmi_bigram(
2085
  texts=texts, labels=labels, subjects=subjects_only,
2086
  topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
2087
  )
2088
 
 
2089
  bucket_indexers.append(df_part.index)
2090
  labels_list.append(pd.Series(labels, index=df_part.index))
2091
  cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
2092
  anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
 
2093
  term_names_global.update({int(k): v for k, v in term_names.items()})
2094
 
2095
  if single_partition and X_reduced is not None:
2096
  X_reduced_holder = X_reduced
2097
+
2098
  if labels_list:
2099
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
2100
  df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
 
2104
  df_main["cluster_name"] = "unclustered"
2105
  df_main["anomaly_score"] = np.nan
2106
 
2107
+ # Assign fixed ids for news/alerts buckets
2108
  if len(df_news):
2109
  df_news.loc[:, "cluster_id"] = -1
2110
  df_news.loc[:, "cluster_name"] = "newsletter/news"
 
2114
  df_alerts.loc[:, "cluster_name"] = "system/alerts"
2115
  df_alerts.loc[:, "anomaly_score"] = np.nan
2116
 
2117
+ # Merge back
2118
  df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
2119
+
2120
+ # Label de-dup pass per-bucket
2121
+ df = dedupe_all_labels(df)
2122
+
2123
+ # Scores
2124
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
2125
  df = compute_context_anomaly(df)
2126
 
2127
+ # Surveillance campaigns
2128
+ wl = [w.strip() for w in (watchlist_in or "").split(",") if w.strip()]
2129
+ ent_df, samp_df = detect_surveillance_campaigns(df, watchlist=wl, min_mentions=int(min_mentions or 15))
2130
+ df = tag_surveillance_emails(df, ent_df, threshold=4.5)
2131
+
2132
+ # Build indexes/search
2133
  index_obj = None
2134
  use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
2135
  if use_faiss_flag:
 
2146
  except Exception:
2147
  pass
2148
 
2149
+ # Summaries
2150
  cluster_counts = (
2151
  df.groupby(["bucket", "cluster_id", "cluster_name"])
2152
  .size()
 
2193
  norm_obj_out = norm_obj_local if single_partition else None
2194
 
2195
  return (
2196
+ status_md, # status
2197
+ cluster_counts, domain_counts, sender_counts, # summaries
2198
+ actors, offhours_table, # extra summaries
2199
+ ent_df, samp_df, # surveillance tables
2200
+ out_table, # results table
2201
+ df, vec_state, X_reduced_holder, # states
2202
+ index_obj, term_names_global, # index + labels
2203
+ bool(use_lsa), use_faiss_flag, # flags
2204
  gr.update(choices=cluster_choices, value="(any)"),
2205
  gr.update(choices=domain_choices, value="(any)"),
2206
  gr.update(choices=sender_choices, value="(any)"),
 
2210
  gr.update(choices=bucket_choices, value="(any)")
2211
  )
2212
 
2213
+ # Bind Process button
2214
  (run_btn.click)(
2215
  process_file,
2216
  inputs=[
 
2219
  trusted_domains_in, extra_keywords_in, highlight_toggle,
2220
  use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
2221
  per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
2222
+ watchlist_in, min_mentions
2223
  ],
2224
  outputs=[
2225
  status, cluster_counts_df, domain_counts_df, sender_counts_df,
2226
+ actors_df, offhours_df,
2227
+ surv_entities_df, surv_samples_df,
2228
+ results_df,
2229
+ state_df, state_vec, state_X_reduced,
2230
+ state_index, state_term_names,
2231
  state_use_lsa, state_use_faiss,
2232
  cluster_drop, domain_drop, sender_drop, lang_drop,
2233
+ state_svd, state_norm, state_dims,
2234
+ state_extra_terms, state_highlight,
2235
  bucket_drop,
2236
  ],
2237
  )
2238
 
2239
  # -------- Filtering & Search --------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2240
  def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
2241
  if df is None or len(df) == 0:
2242
  return pd.DataFrame()
 
2245
  )
2246
  return _sort_results(filt, sort_by, sort_dir)
2247
 
2248
+ # Re-run when any filter control changes
2249
  for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
2250
  date_start, date_end, sort_by, sort_dir, hide_noise]:
2251
  ctrl.change(
 
2272
  outputs=[results_df],
2273
  )
2274
 
2275
+ # --- Search ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2276
  def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
2277
  if not q or df is None or vec is None or index is None:
2278
  return pd.DataFrame(), []
 
2295
  return pd.DataFrame(), q_terms
2296
 
2297
  if isinstance(index, NearestNeighbors):
 
2298
  if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
2299
  return pd.DataFrame(), q_terms
2300
  dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
 
2319
  outputs=[results_df, state_query_terms],
2320
  )
2321
 
2322
+ # --- Reader selection (highlighting) ---
2323
  def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
2324
  if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
2325
  return ""
 
2354
  outputs=[email_view],
2355
  )
2356
 
2357
+ # --- Click-to-filter helpers ---
2358
  def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
2359
  if evt.index is None or df_sum is None or df_sum.empty:
2360
  return gr.update()
 
2367
  r = df_sum.iloc[evt.index[0]]
2368
  return gr.update(value=r["bucket"]), gr.update(value=r["label"])
2369
 
 
2370
  cluster_counts_df.select(
2371
  on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
2372
  ).then(
 
2378
  outputs=[results_df],
2379
  )
2380
 
 
2381
  domain_counts_df.select(
2382
  lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
2383
  ).then(
 
2389
  outputs=[results_df],
2390
  )
2391
 
 
2392
  sender_counts_df.select(
2393
  lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
2394
  ).then(
 
2402
 
2403
  if __name__ == "__main__":
2404
  # Disable SSR to avoid handler arity warnings under server-side rendering
2405
+ demo.launch(ssr_mode=False)
2406
+