wuhp committed (verified)
Commit 186377a · Parent: abfac2f

Update app.py

Files changed (1): app.py (+212 −206)

app.py CHANGED
@@ -64,6 +64,74 @@ try:
 except Exception:
     VADER_OK = False
 
+# ======== STAGE-1 TAXONOMY (Buckets) ========
+TAXONOMY = {
+    "Lobbyist": ["lobby","lobbyist","pac","influence"],
+    "Campaign Finance": ["donation","contribution","fundraiser","pledge","campaign finance","pac"],
+    "Procurement": ["contract","tender","rfp","rfq","bid","invoice","vendor","purchase order","po"],
+    "HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
+    "Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
+    "Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
+    "Legal": ["legal","lawsuit","attorney","counsel","privileged","court","subpoena","confidential"],
+    "IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
+    "Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
+    "Other": [],
+}
+# header/domain cues (expand as you learn)
+LOBBY_DOMAINS = set()  # e.g., {"acme-lobby.com"}
+LEGAL_DOMAINS = set()  # e.g., {"biglaw.com","firmlaw.com"}
+
+def _contains_any(text: str, terms: list[str]) -> bool:
+    if not text or not terms:
+        return False
+    tl = text.lower()
+    return any(t for t in terms if t and t.lower() in tl)
+
+def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
+    fd = (row.get("from_domain") or "").lower()
+    subj = (row.get("subject") or "")
+    if bucket == "Newsletters/Alerts":
+        return 5.0 if is_news_like(subj, row.get("body_text",""), fd) else 0.0
+    if bucket == "IT/Security":
+        return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
+    if bucket == "Constituent":
+        # personal mail to public office is a strong hint
+        return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
+    if bucket == "Lobbyist":
+        return 5.0 if fd in LOBBY_DOMAINS else 0.0
+    if bucket == "Legal":
+        return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
+    if bucket == "Scheduling":
+        # ICS invite or explicit invite subject
+        body = (row.get("body_text") or "")
+        return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
+    return 0.0
+
+MIN_ROUTE_SCORE = 1.5  # at least ~2 weak signals or one strong
+TIE_MARGIN = 1.0
+
+def route_email_row(row: pd.Series) -> str:
+    text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
+    scores: dict[str,float] = {b: 0.0 for b in TAXONOMY.keys()}
+    # lexicon points
+    for b, terms in TAXONOMY.items():
+        if not terms:
+            continue
+        # count unique term hits to avoid over-crediting repeats
+        hits = sum(1 for t in terms if t and t.lower() in text)
+        scores[b] += float(hits)
+        # strong phrases in your corruption lexicon can hint Lobbyist/Procurement
+        if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
+            scores[b] += 1.0
+    # header bonuses
+    for b in TAXONOMY.keys():
+        scores[b] += _bucket_header_bonus(row, b)
+    # choose
+    best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
+    second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
+    if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
+        return "Other"
+    return best_bucket
+
 # =================== Regex & Flags ===================
 TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
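Note: to make the route-or-abstain rule above concrete, here is a minimal standalone sketch with a toy taxonomy (names and data are hypothetical; the app's full version also adds header/domain bonuses and the SUSPECT_PHRASES hint):

    from typing import Dict, List

    TOY_TAXONOMY: Dict[str, List[str]] = {
        "Procurement": ["contract", "tender", "invoice"],
        "Scheduling": ["meeting", "calendar", "invite"],
        "Other": [],
    }
    MIN_ROUTE_SCORE = 1.5  # ~two weak signals or one strong
    TIE_MARGIN = 1.0

    def route(subject: str, body: str) -> str:
        text = f"{subject} {body}".lower()
        # one point per unique term hit, so repeated words are not over-credited
        scores = {b: float(sum(1 for t in terms if t in text)) for b, terms in TOY_TAXONOMY.items()}
        best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
        second = sorted(scores.values(), reverse=True)[1]
        if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
            return "Other"  # abstain on weak or ambiguous evidence
        return best_bucket

    print(route("RFP: catering contract", "please send your invoice and tender documents"))  # Procurement
    print(route("hello", "quick question"))  # Other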
@@ -1082,6 +1150,50 @@ def corruption_score(row, trusted_domains: set):
         score += 0.3
     return score
 
+def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
+    if df_in.empty:
+        df_in["context_anomaly_score"] = 0.0
+        return df_in
+
+    # 1) IsolationForest percentile -> 0–6. anomaly_score is already computed per partition,
+    # and higher = more anomalous in the current pipeline (score_samples is negated upstream).
+    # Convert it to a percentile per bucket.
+    df = df_in.copy()
+    if "anomaly_score" in df.columns:
+        df["_if_pct"] = 0.0
+        for bkt, grp in df.groupby("bucket", dropna=False):
+            vals = grp["anomaly_score"].astype(float)
+            if vals.notna().sum() >= 5:
+                ranks = vals.rank(pct=True)  # top anomaly gets 1.0
+                df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
+        df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
+    else:
+        df["_if_pts"] = 0.0
+
+    # 2) Rule violations per bucket (0–2)
+    df["_rule_pts"] = 0.0
+    low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
+    for bkt, terms in TAXONOMY.items():
+        mask = (df["bucket"] == bkt)
+        if not mask.any():
+            continue
+        if terms:
+            has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
+            df.loc[mask & (~has_term), "_rule_pts"] += 1.0
+        # header expectation examples:
+        if bkt == "Constituent":
+            df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
+        if bkt == "Scheduling":
+            subj = df["subject"].fillna("").str.lower()
+            df.loc[mask & (~subj.str.contains(r"\b(?:meeting|invite|schedule|calendar)\b", regex=True)), "_rule_pts"] += 1.0
+
+    df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
+
+    # 3) Corruption heuristics capped to 0–3
+    df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
+
+    df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
+    return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
+
 # =================== Gradio UI ===================
 CSS = """
 :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
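Note: for intuition, a compact sketch of how the three components combine into the 0–10 context score (toy data; the column names stand in for the intermediate _if_pts/_rule_pts/_corr_pts columns, and the real function only ranks buckets with at least 5 scores):

    import pandas as pd

    df = pd.DataFrame({
        "bucket": ["Procurement", "Procurement", "Scheduling"],
        "anomaly_score": [0.9, 0.1, 0.5],     # higher = more anomalous
        "rule_pts": [1.0, 0.0, 2.0],          # unmet per-bucket expectations
        "corruption_score": [4.0, 0.5, 1.0],  # capped to 3 below
    })
    if_pts = df.groupby("bucket")["anomaly_score"].rank(pct=True).mul(6.0).clip(0, 6)
    composite = (if_pts + df["rule_pts"].clip(0, 2) + df["corruption_score"].clip(0, 3)).clip(0, 10)
    print(composite.tolist())  # [10.0, 3.5, 9.0]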
@@ -1145,6 +1257,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="")
             embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False)
         with gr.Row():
+            bucket_drop = gr.Dropdown(label="Bucket", choices=["(any)"] + list(TAXONOMY.keys()), value="(any)", allow_custom_value=False)
             cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
             domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
             sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
@@ -1154,7 +1267,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         with gr.Row():
             date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
             date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
-            sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
+            sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
             sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
             # NEW: hide noise toggle
             hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
@@ -1227,6 +1340,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
 
     def _apply_filters(
         df: pd.DataFrame,
+        bucket: Optional[str],
         cluster: Optional[str],
         domain: Optional[str],
         sender: Optional[str],
@@ -1235,11 +1349,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         tag_value: str,
         start: str,
         end: str,
-        hide_noise_flag: bool = False,  # NEW
+        hide_noise_flag: bool = False,
     ) -> pd.DataFrame:
         out = df
+        if bucket and bucket != "(any)":
+            out = out[out["bucket"] == bucket]
         if cluster and cluster != "(any)":
-            m = re.match(r"^(\-?\d+)\s+—", cluster)
+            # Modified to parse the new cluster label format
+            m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
             if m:
                 cid = int(m.group(1))
                 out = out[out["cluster_id"] == cid]
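Note: a quick check that the loosened pattern still recovers the cluster id now that labels carry a bucket prefix (example label hypothetical):

    import re

    label = "Procurement — 7 — vendor invoices (42)"  # "bucket — id — name (count)"
    m = re.match(r"^.*?(\-?\d+)\s+—", label)
    print(int(m.group(1)))  # 7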
@@ -1298,7 +1415,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if inbox_file is None:
             return ("**Please upload a file.**",
                     None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
-                    None, None, None, None)
+                    None, None, None, None, None)
 
     # === Vectorization & Clustering (UPGRADED) ===
     def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
@@ -1484,7 +1601,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if not recs:
             return ("**No valid records found.**",
                     None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
-                    None, None, None, None)
+                    None, None, None, None, None)
 
         normd = []
         for r in tqdm(recs, desc="Normalize", leave=False):
@@ -1495,12 +1612,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if df.empty:
             return ("**No usable email records.**",
                     None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
-                    None, None, None, None)
+                    None, None, None, None, None)
 
         df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
         df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
         df = compute_sentiment_column(df)
 
+        # >>> NEW: stage-1 routing
+        df["bucket"] = df.apply(route_email_row, axis=1)
+        df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
+        df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
+        df.loc[df["is_news"], "bucket"] = "Newsletters/Alerts"
+        df.loc[df["is_notify"], "bucket"] = "IT/Security"
+
         flags = []
         for _, row in df.iterrows():
             f = []
@@ -1514,12 +1638,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             flags.append(f)
         df["flags"] = flags
 
-        df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
-        df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
-
-        df_main = df[~(df["is_news"] | df["is_notify"])].reset_index(drop=True)
-        df_news = df[df["is_news"]].reset_index(drop=True)
-        df_alerts = df[df["is_notify"]].reset_index(drop=True)
+        df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
+        df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
+        df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
 
         kv = None
         emb_dim = 0
@@ -1527,20 +1648,23 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
 
         parts = []
-        if bool(per_language) and "lang" in df_main.columns:
-            for lang_code, grp in df_main.groupby("lang", dropna=False):
-                parts.append((lang_code, grp.copy()))
+        if bool(per_language):
+            for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
+                for lang_code, grp in g_bucket.groupby("lang", dropna=False):
+                    parts.append(((bkt, lang_code), grp.copy()))
         else:
-            parts = [("all", df_main.copy())]
+            for bkt, grp in df_main.groupby("bucket", dropna=False):
+                parts.append(((bkt, "all"), grp.copy()))
 
         labels_list, cluster_name_list, anomaly_list = [], [], []
+        bucket_indexers = []  # keep index locations to reassign later
         X_reduced_holder = None
         term_names_global = {}
         single_partition = (len(parts) == 1)
         d_word_agg, d_char_agg, k_agg = 0, 0, 0
         svd_obj_local, norm_obj_local = None, None
 
-        for p_lang, df_part in parts:
+        for (bucket_name, _lang), df_part in parts:
             if df_part.empty:
                 continue
             texts, subjects_only = _make_texts(df_part)
@@ -1589,38 +1713,29 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 d_word=d_word,
                 d_char=d_char,
             )
-            # NEW: stabilize per partition
-            labels = stabilize_labels(
-                X_space, labels,
-                min_size=40,
-                merge_thresh=0.96,
-                reassign_thresh=0.35,
-            )
-
+            labels = stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35)
             k_agg += len(set(labels))
 
             term_names = cluster_labels_pmi_bigram(
                 texts=texts, labels=labels, subjects=subjects_only,
                 topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
             )
-            term_names_global.update({int(k): v for k, v in term_names.items()})
 
+            # collect to write back in one go
+            bucket_indexers.append(df_part.index)
             labels_list.append(pd.Series(labels, index=df_part.index))
-            cluster_name_list.append(
-                pd.Series(
-                    [term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels],
-                    index=df_part.index,
-                )
-            )
+            cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
             anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
+            # remember term names for the reader pill
+            term_names_global.update({int(k): v for k, v in term_names.items()})
 
             if single_partition and X_reduced is not None:
                 X_reduced_holder = X_reduced
 
         if labels_list:
-            df_main["cluster_id"] = pd.concat(labels_list).sort_index()
-            df_main["cluster_name"] = pd.concat(cluster_name_list).sort_index()
-            df_main["anomaly_score"] = pd.concat(anomaly_list).sort_index()
+            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
+            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
+            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "anomaly_score"] = pd.concat(anomaly_list).sort_index()
         else:
             df_main["cluster_id"] = -10
             df_main["cluster_name"] = "unclustered"
@@ -1637,6 +1752,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
 
         df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
         df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
+        df = compute_context_anomaly(df)
 
         index_obj = None
         use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
@@ -1655,16 +1771,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 pass
 
         cluster_counts = (
-            df.groupby(["cluster_id", "cluster_name"])
+            df.groupby(["bucket", "cluster_id", "cluster_name"])
             .size()
             .reset_index(name="count")
             .sort_values("count", ascending=False)
             .head(500)
         )
         cluster_counts["label"] = cluster_counts.apply(
-            lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
+            lambda r: f'{r["bucket"]} — {int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
         cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
+        bucket_choices = ["(any)"] + sorted(df["bucket"].dropna().unique().tolist())
+
         domain_counts = (
             df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False).head(100)
         )
@@ -1682,7 +1800,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             .sort_values("corruption_score", ascending=False)
             .head(200)
         )
-        out_table = _sort_results(df, "corruption_score", "desc")
+        out_table = _sort_results(df, "context_anomaly_score", "desc")
 
         vec_state = {
             "use_hashing": bool(use_hashing),
@@ -1698,85 +1816,35 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         norm_obj_out = norm_obj_local if single_partition else None
 
         return (
-            status_md,
-            cluster_counts,
-            domain_counts,
-            sender_counts,
-            actors,
-            offhours_table,
-            out_table,
-            df,
-            vec_state,
-            X_reduced_holder,
-            index_obj,
-            term_names_global,
-            bool(use_lsa),
-            use_faiss_flag,
+            status_md, cluster_counts, domain_counts, sender_counts, actors, offhours_table,
+            out_table, df, vec_state, X_reduced_holder, index_obj, term_names_global,
+            bool(use_lsa), use_faiss_flag,
             gr.update(choices=cluster_choices, value="(any)"),
             gr.update(choices=domain_choices, value="(any)"),
             gr.update(choices=sender_choices, value="(any)"),
             gr.update(choices=lang_choices, value="(any)"),
-            svd_obj_out,
-            norm_obj_out,
-            (d_word_agg, d_char_agg),
-            extra_terms_lower,
-            bool(highlight_toggle),
+            svd_obj_out, norm_obj_out, (d_word_agg, d_char_agg),
+            extra_terms_lower, bool(highlight_toggle),
+            gr.update(choices=bucket_choices, value="(any)")
         )
 
     (run_btn.click)(
         process_file,
         inputs=[
-            inbox_file,
-            max_features,
-            min_df,
-            max_df,
-            use_bigrams,
-            skip_lang,
-            use_lsa,
-            lsa_dim,
-            auto_k,
-            k_clusters,
-            mb_batch,
-            use_faiss,
-            use_iso,
-            trusted_domains_in,
-            extra_keywords_in,
-            highlight_toggle,
-            use_hashing,
-            hash_bits,
-            use_hdbscan,
-            hdb_min_cluster,
-            hdb_min_samples,
-            per_language,
-            use_embeddings,
-            embed_weight,
-            embeddings_path,
-            embeddings_binary,
+            inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
+            use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
+            trusted_domains_in, extra_keywords_in, highlight_toggle,
+            use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
+            per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
         ],
         outputs=[
-            status,
-            cluster_counts_df,
-            domain_counts_df,
-            sender_counts_df,
-            actors_df,
-            offhours_df,
-            results_df,
-            state_df,
-            state_vec,
-            state_X_reduced,
-            state_index,
-            state_term_names,
-            state_use_lsa,
-            state_use_faiss,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            state_svd,
-            state_norm,
-            state_dims,
-            state_extra_terms,
-            state_highlight,
+            status, cluster_counts_df, domain_counts_df, sender_counts_df,
+            actors_df, offhours_df, results_df,
+            state_df, state_vec, state_X_reduced, state_index, state_term_names,
+            state_use_lsa, state_use_faiss,
+            cluster_drop, domain_drop, sender_drop, lang_drop,
+            state_svd, state_norm, state_dims, state_extra_terms, state_highlight,
+            bucket_drop,
         ],
     )
 
@@ -1789,85 +1857,56 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
         else:
             tmp["_dt"] = pd.NaT
-        by = by if by in tmp.columns else "corruption_score"
+        by = by if by in tmp.columns else "context_anomaly_score"
         asc = (direction == "asc")
         sort_cols = [by]
         if by == "date":
             sort_cols = ["_dt"]
-        elif by in ["anomaly_score", "corruption_score"]:
+        elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
             sort_cols.append("_dt")
 
         tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
 
         cols_out = [
-            "date",
-            "from_email",
-            "from_domain",
-            "subject",
-            "cluster_name",
-            "lang",
-            "tags",
-            "flags",
-            "sentiment",
-            "corruption_score",
-            "anomaly_score",
+            "date","bucket","from_email","from_domain","subject","cluster_name","lang",
+            "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
         ]
         if "search_score" in tmp.columns:
             cols_out.append("search_score")
 
         return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
 
-    def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
+    def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
         if df is None or len(df) == 0:
             return pd.DataFrame()
         filt = _apply_filters(
-            df, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
+            df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
         )
         return _sort_results(filt, sort_by, sort_dir)
 
     # Re-run when any filter control changes (including hide_noise)
-    for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
+    for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
                  date_start, date_end, sort_by, sort_dir, hide_noise]:
         ctrl.change(
             refresh_results,
             inputs=[
-                state_df,
-                cluster_drop,
-                domain_drop,
-                sender_drop,
-                lang_drop,
-                sentiment_drop,
-                tag_drop,
-                date_start,
-                date_end,
-                sort_by,
-                sort_dir,
-                hide_noise,
+                state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+                sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
             ],
             outputs=[results_df],
         )
 
-    # Reset filters (sets selects to (any), dates blank, sort default, and hide_noise=True)
+    # Reset filters
     reset_btn.click(
-        lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"] + [True],
+        lambda: ["(any)"] * 7 + [""] * 2 + ["context_anomaly_score", "desc"] + [True],
         [],
-        [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
+        [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
          date_start, date_end, sort_by, sort_dir, hide_noise],
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise,
        ],
        outputs=[results_df],
    )
@@ -1964,17 +2003,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     search_btn.click(
         search_fn,
         inputs=[
-            search_query,
-            state_df,
-            state_vec,
-            state_X_reduced,
-            state_index,
-            state_use_lsa,
-            state_use_faiss,
-            state_svd,
-            state_norm,
-            sort_by,
-            sort_dir,
+            search_query, state_df, state_vec, state_X_reduced, state_index,
+            state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir,
         ],
         outputs=[results_df, state_query_terms],
     )
@@ -2003,7 +2033,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         return build_highlighted_html(
             row,
             query_terms=q_terms,
-            cluster_label=clabel,
+            cluster_label=f'{row.get("bucket","Other")} / {clabel}',
             do_highlight=do_highlight,
             extra_terms=extra_terms,
         )
@@ -2020,25 +2050,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return gr.update()
         val = df_sum.iloc[evt.index[0]][col_name]
         return gr.update(value=val)
+
+    def on_cluster_summary_select(evt: gr.SelectData, df_sum: pd.DataFrame):
+        if evt.index is None or df_sum is None or df_sum.empty:
+            return gr.update(), gr.update()
+        r = df_sum.iloc[evt.index[0]]
+        return gr.update(value=r["bucket"]), gr.update(value=r["label"])
 
-    # Cluster summary → set cluster filter
+    # Cluster summary → set bucket & cluster filter
     cluster_counts_df.select(
-        lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
+        on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
         ],
         outputs=[results_df],
     )
@@ -2049,18 +2075,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
         ],
         outputs=[results_df],
     )
@@ -2071,18 +2087,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     ).then(
         refresh_results,
         inputs=[
-            state_df,
-            cluster_drop,
-            domain_drop,
-            sender_drop,
-            lang_drop,
-            sentiment_drop,
-            tag_drop,
-            date_start,
-            date_end,
-            sort_by,
-            sort_dir,
-            hide_noise,
+            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
+            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
         ],
         outputs=[results_df],
     )
 