Update app.py
Browse files
app.py
CHANGED
|
@@ -64,6 +64,74 @@ try:
|
|
| 64 |
except Exception:
|
| 65 |
VADER_OK = False
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
# =================== Regex & Flags ===================
|
| 68 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 69 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
|
@@ -1082,6 +1150,50 @@ def corruption_score(row, trusted_domains: set):
|
|
| 1082 |
score += 0.3
|
| 1083 |
return score
|
| 1084 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1085 |
# =================== Gradio UI ===================
|
| 1086 |
CSS = """
|
| 1087 |
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
|
|
@@ -1145,6 +1257,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1145 |
embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="")
|
| 1146 |
embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False)
|
| 1147 |
with gr.Row():
|
|
|
|
| 1148 |
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 1149 |
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
| 1150 |
sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
|
|
@@ -1154,7 +1267,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1154 |
with gr.Row():
|
| 1155 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 1156 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 1157 |
-
sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="
|
| 1158 |
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 1159 |
# NEW: hide noise toggle
|
| 1160 |
hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
|
|
@@ -1227,6 +1340,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1227 |
|
| 1228 |
def _apply_filters(
|
| 1229 |
df: pd.DataFrame,
|
|
|
|
| 1230 |
cluster: Optional[str],
|
| 1231 |
domain: Optional[str],
|
| 1232 |
sender: Optional[str],
|
|
@@ -1235,11 +1349,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1235 |
tag_value: str,
|
| 1236 |
start: str,
|
| 1237 |
end: str,
|
| 1238 |
-
hide_noise_flag: bool = False,
|
| 1239 |
) -> pd.DataFrame:
|
| 1240 |
out = df
|
|
|
|
|
|
|
| 1241 |
if cluster and cluster != "(any)":
|
| 1242 |
-
|
|
|
|
| 1243 |
if m:
|
| 1244 |
cid = int(m.group(1))
|
| 1245 |
out = out[out["cluster_id"] == cid]
|
|
@@ -1298,7 +1415,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1298 |
if inbox_file is None:
|
| 1299 |
return ("**Please upload a file.**",
|
| 1300 |
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1301 |
-
None, None, None, None)
|
| 1302 |
|
| 1303 |
# === Vectorization & Clustering (UPGRADED) ===
|
| 1304 |
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
|
@@ -1484,7 +1601,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1484 |
if not recs:
|
| 1485 |
return ("**No valid records found.**",
|
| 1486 |
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1487 |
-
None, None, None, None)
|
| 1488 |
|
| 1489 |
normd = []
|
| 1490 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
|
@@ -1495,12 +1612,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1495 |
if df.empty:
|
| 1496 |
return ("**No usable email records.**",
|
| 1497 |
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1498 |
-
None, None, None, None)
|
| 1499 |
|
| 1500 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 1501 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 1502 |
df = compute_sentiment_column(df)
|
| 1503 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1504 |
flags = []
|
| 1505 |
for _, row in df.iterrows():
|
| 1506 |
f = []
|
|
@@ -1514,12 +1638,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1514 |
flags.append(f)
|
| 1515 |
df["flags"] = flags
|
| 1516 |
|
| 1517 |
-
|
| 1518 |
-
df["
|
| 1519 |
-
|
| 1520 |
-
df_main = df[~(df["is_news"] | df["is_notify"])].reset_index(drop=True)
|
| 1521 |
-
df_news = df[df["is_news"]].reset_index(drop=True)
|
| 1522 |
-
df_alerts = df[df["is_notify"]].reset_index(drop=True)
|
| 1523 |
|
| 1524 |
kv = None
|
| 1525 |
emb_dim = 0
|
|
@@ -1527,20 +1648,23 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1527 |
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1528 |
|
| 1529 |
parts = []
|
| 1530 |
-
if bool(per_language)
|
| 1531 |
-
for
|
| 1532 |
-
|
|
|
|
| 1533 |
else:
|
| 1534 |
-
|
|
|
|
| 1535 |
|
| 1536 |
labels_list, cluster_name_list, anomaly_list = [], [], []
|
|
|
|
| 1537 |
X_reduced_holder = None
|
| 1538 |
term_names_global = {}
|
| 1539 |
single_partition = (len(parts) == 1)
|
| 1540 |
d_word_agg, d_char_agg, k_agg = 0, 0, 0
|
| 1541 |
svd_obj_local, norm_obj_local = None, None
|
| 1542 |
|
| 1543 |
-
for
|
| 1544 |
if df_part.empty:
|
| 1545 |
continue
|
| 1546 |
texts, subjects_only = _make_texts(df_part)
|
|
@@ -1589,38 +1713,29 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1589 |
d_word=d_word,
|
| 1590 |
d_char=d_char,
|
| 1591 |
)
|
| 1592 |
-
|
| 1593 |
-
labels = stabilize_labels(
|
| 1594 |
-
X_space, labels,
|
| 1595 |
-
min_size=40,
|
| 1596 |
-
merge_thresh=0.96,
|
| 1597 |
-
reassign_thresh=0.35,
|
| 1598 |
-
)
|
| 1599 |
-
|
| 1600 |
k_agg += len(set(labels))
|
| 1601 |
|
| 1602 |
term_names = cluster_labels_pmi_bigram(
|
| 1603 |
texts=texts, labels=labels, subjects=subjects_only,
|
| 1604 |
topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
|
| 1605 |
)
|
| 1606 |
-
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1607 |
|
|
|
|
|
|
|
| 1608 |
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1609 |
-
cluster_name_list.append(
|
| 1610 |
-
pd.Series(
|
| 1611 |
-
[term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels],
|
| 1612 |
-
index=df_part.index,
|
| 1613 |
-
)
|
| 1614 |
-
)
|
| 1615 |
anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
|
|
|
|
|
|
|
| 1616 |
|
| 1617 |
if single_partition and X_reduced is not None:
|
| 1618 |
X_reduced_holder = X_reduced
|
| 1619 |
-
|
| 1620 |
if labels_list:
|
| 1621 |
-
df_main["cluster_id"]
|
| 1622 |
-
df_main["cluster_name"] = pd.concat(cluster_name_list).sort_index()
|
| 1623 |
-
df_main["anomaly_score"] = pd.concat(anomaly_list).sort_index()
|
| 1624 |
else:
|
| 1625 |
df_main["cluster_id"] = -10
|
| 1626 |
df_main["cluster_name"] = "unclustered"
|
|
@@ -1637,6 +1752,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1637 |
|
| 1638 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 1639 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
|
|
|
| 1640 |
|
| 1641 |
index_obj = None
|
| 1642 |
use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
|
|
@@ -1655,16 +1771,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1655 |
pass
|
| 1656 |
|
| 1657 |
cluster_counts = (
|
| 1658 |
-
df.groupby(["cluster_id", "cluster_name"])
|
| 1659 |
-
|
| 1660 |
-
|
| 1661 |
-
|
| 1662 |
-
|
| 1663 |
)
|
| 1664 |
cluster_counts["label"] = cluster_counts.apply(
|
| 1665 |
-
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 1666 |
)
|
| 1667 |
cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
|
|
|
|
|
|
|
| 1668 |
domain_counts = (
|
| 1669 |
df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False).head(100)
|
| 1670 |
)
|
|
@@ -1682,7 +1800,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1682 |
.sort_values("corruption_score", ascending=False)
|
| 1683 |
.head(200)
|
| 1684 |
)
|
| 1685 |
-
out_table = _sort_results(df, "
|
| 1686 |
|
| 1687 |
vec_state = {
|
| 1688 |
"use_hashing": bool(use_hashing),
|
|
@@ -1698,85 +1816,35 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1698 |
norm_obj_out = norm_obj_local if single_partition else None
|
| 1699 |
|
| 1700 |
return (
|
| 1701 |
-
status_md,
|
| 1702 |
-
|
| 1703 |
-
|
| 1704 |
-
sender_counts,
|
| 1705 |
-
actors,
|
| 1706 |
-
offhours_table,
|
| 1707 |
-
out_table,
|
| 1708 |
-
df,
|
| 1709 |
-
vec_state,
|
| 1710 |
-
X_reduced_holder,
|
| 1711 |
-
index_obj,
|
| 1712 |
-
term_names_global,
|
| 1713 |
-
bool(use_lsa),
|
| 1714 |
-
use_faiss_flag,
|
| 1715 |
gr.update(choices=cluster_choices, value="(any)"),
|
| 1716 |
-
gr.update(choices=domain_choices,
|
| 1717 |
-
gr.update(choices=sender_choices,
|
| 1718 |
gr.update(choices=lang_choices, value="(any)"),
|
| 1719 |
-
svd_obj_out,
|
| 1720 |
-
|
| 1721 |
-
(
|
| 1722 |
-
extra_terms_lower,
|
| 1723 |
-
bool(highlight_toggle),
|
| 1724 |
)
|
| 1725 |
|
| 1726 |
(run_btn.click)(
|
| 1727 |
process_file,
|
| 1728 |
inputs=[
|
| 1729 |
-
inbox_file,
|
| 1730 |
-
|
| 1731 |
-
|
| 1732 |
-
|
| 1733 |
-
|
| 1734 |
-
skip_lang,
|
| 1735 |
-
use_lsa,
|
| 1736 |
-
lsa_dim,
|
| 1737 |
-
auto_k,
|
| 1738 |
-
k_clusters,
|
| 1739 |
-
mb_batch,
|
| 1740 |
-
use_faiss,
|
| 1741 |
-
use_iso,
|
| 1742 |
-
trusted_domains_in,
|
| 1743 |
-
extra_keywords_in,
|
| 1744 |
-
highlight_toggle,
|
| 1745 |
-
use_hashing,
|
| 1746 |
-
hash_bits,
|
| 1747 |
-
use_hdbscan,
|
| 1748 |
-
hdb_min_cluster,
|
| 1749 |
-
hdb_min_samples,
|
| 1750 |
-
per_language,
|
| 1751 |
-
use_embeddings,
|
| 1752 |
-
embed_weight,
|
| 1753 |
-
embeddings_path,
|
| 1754 |
-
embeddings_binary,
|
| 1755 |
],
|
| 1756 |
outputs=[
|
| 1757 |
-
status,
|
| 1758 |
-
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
-
|
| 1763 |
-
|
| 1764 |
-
state_df,
|
| 1765 |
-
state_vec,
|
| 1766 |
-
state_X_reduced,
|
| 1767 |
-
state_index,
|
| 1768 |
-
state_term_names,
|
| 1769 |
-
state_use_lsa,
|
| 1770 |
-
state_use_faiss,
|
| 1771 |
-
cluster_drop,
|
| 1772 |
-
domain_drop,
|
| 1773 |
-
sender_drop,
|
| 1774 |
-
lang_drop,
|
| 1775 |
-
state_svd,
|
| 1776 |
-
state_norm,
|
| 1777 |
-
state_dims,
|
| 1778 |
-
state_extra_terms,
|
| 1779 |
-
state_highlight,
|
| 1780 |
],
|
| 1781 |
)
|
| 1782 |
|
|
@@ -1789,85 +1857,56 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1789 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 1790 |
else:
|
| 1791 |
tmp["_dt"] = pd.NaT
|
| 1792 |
-
by = by if by in tmp.columns else "
|
| 1793 |
asc = (direction == "asc")
|
| 1794 |
sort_cols = [by]
|
| 1795 |
if by == "date":
|
| 1796 |
sort_cols = ["_dt"]
|
| 1797 |
-
elif by in ["anomaly_score", "corruption_score"]:
|
| 1798 |
sort_cols.append("_dt")
|
| 1799 |
|
| 1800 |
tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
|
| 1801 |
|
| 1802 |
cols_out = [
|
| 1803 |
-
"date",
|
| 1804 |
-
"
|
| 1805 |
-
"from_domain",
|
| 1806 |
-
"subject",
|
| 1807 |
-
"cluster_name",
|
| 1808 |
-
"lang",
|
| 1809 |
-
"tags",
|
| 1810 |
-
"flags",
|
| 1811 |
-
"sentiment",
|
| 1812 |
-
"corruption_score",
|
| 1813 |
-
"anomaly_score",
|
| 1814 |
]
|
| 1815 |
if "search_score" in tmp.columns:
|
| 1816 |
cols_out.append("search_score")
|
| 1817 |
|
| 1818 |
return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
|
| 1819 |
|
| 1820 |
-
def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
|
| 1821 |
if df is None or len(df) == 0:
|
| 1822 |
return pd.DataFrame()
|
| 1823 |
filt = _apply_filters(
|
| 1824 |
-
df, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
|
| 1825 |
)
|
| 1826 |
return _sort_results(filt, sort_by, sort_dir)
|
| 1827 |
|
| 1828 |
# Re-run when any filter control changes (including hide_noise)
|
| 1829 |
-
for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1830 |
date_start, date_end, sort_by, sort_dir, hide_noise]:
|
| 1831 |
ctrl.change(
|
| 1832 |
refresh_results,
|
| 1833 |
inputs=[
|
| 1834 |
-
state_df,
|
| 1835 |
-
|
| 1836 |
-
domain_drop,
|
| 1837 |
-
sender_drop,
|
| 1838 |
-
lang_drop,
|
| 1839 |
-
sentiment_drop,
|
| 1840 |
-
tag_drop,
|
| 1841 |
-
date_start,
|
| 1842 |
-
date_end,
|
| 1843 |
-
sort_by,
|
| 1844 |
-
sort_dir,
|
| 1845 |
-
hide_noise,
|
| 1846 |
],
|
| 1847 |
outputs=[results_df],
|
| 1848 |
)
|
| 1849 |
|
| 1850 |
-
# Reset filters
|
| 1851 |
reset_btn.click(
|
| 1852 |
-
lambda: ["(any)"] *
|
| 1853 |
[],
|
| 1854 |
-
[cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1855 |
date_start, date_end, sort_by, sort_dir, hide_noise],
|
| 1856 |
).then(
|
| 1857 |
refresh_results,
|
| 1858 |
inputs=[
|
| 1859 |
-
state_df,
|
| 1860 |
-
|
| 1861 |
-
domain_drop,
|
| 1862 |
-
sender_drop,
|
| 1863 |
-
lang_drop,
|
| 1864 |
-
sentiment_drop,
|
| 1865 |
-
tag_drop,
|
| 1866 |
-
date_start,
|
| 1867 |
-
date_end,
|
| 1868 |
-
sort_by,
|
| 1869 |
-
sort_dir,
|
| 1870 |
-
hide_noise,
|
| 1871 |
],
|
| 1872 |
outputs=[results_df],
|
| 1873 |
)
|
|
@@ -1964,17 +2003,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1964 |
search_btn.click(
|
| 1965 |
search_fn,
|
| 1966 |
inputs=[
|
| 1967 |
-
search_query,
|
| 1968 |
-
|
| 1969 |
-
state_vec,
|
| 1970 |
-
state_X_reduced,
|
| 1971 |
-
state_index,
|
| 1972 |
-
state_use_lsa,
|
| 1973 |
-
state_use_faiss,
|
| 1974 |
-
state_svd,
|
| 1975 |
-
state_norm,
|
| 1976 |
-
sort_by,
|
| 1977 |
-
sort_dir,
|
| 1978 |
],
|
| 1979 |
outputs=[results_df, state_query_terms],
|
| 1980 |
)
|
|
@@ -2003,7 +2033,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2003 |
return build_highlighted_html(
|
| 2004 |
row,
|
| 2005 |
query_terms=q_terms,
|
| 2006 |
-
cluster_label=clabel,
|
| 2007 |
do_highlight=do_highlight,
|
| 2008 |
extra_terms=extra_terms,
|
| 2009 |
)
|
|
@@ -2020,25 +2050,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2020 |
return gr.update()
|
| 2021 |
val = df_sum.iloc[evt.index[0]][col_name]
|
| 2022 |
return gr.update(value=val)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2023 |
|
| 2024 |
-
# Cluster summary → set cluster filter
|
| 2025 |
cluster_counts_df.select(
|
| 2026 |
-
|
| 2027 |
).then(
|
| 2028 |
refresh_results,
|
| 2029 |
inputs=[
|
| 2030 |
-
state_df,
|
| 2031 |
-
|
| 2032 |
-
domain_drop,
|
| 2033 |
-
sender_drop,
|
| 2034 |
-
lang_drop,
|
| 2035 |
-
sentiment_drop,
|
| 2036 |
-
tag_drop,
|
| 2037 |
-
date_start,
|
| 2038 |
-
date_end,
|
| 2039 |
-
sort_by,
|
| 2040 |
-
sort_dir,
|
| 2041 |
-
hide_noise,
|
| 2042 |
],
|
| 2043 |
outputs=[results_df],
|
| 2044 |
)
|
|
@@ -2049,18 +2075,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2049 |
).then(
|
| 2050 |
refresh_results,
|
| 2051 |
inputs=[
|
| 2052 |
-
state_df,
|
| 2053 |
-
|
| 2054 |
-
domain_drop,
|
| 2055 |
-
sender_drop,
|
| 2056 |
-
lang_drop,
|
| 2057 |
-
sentiment_drop,
|
| 2058 |
-
tag_drop,
|
| 2059 |
-
date_start,
|
| 2060 |
-
date_end,
|
| 2061 |
-
sort_by,
|
| 2062 |
-
sort_dir,
|
| 2063 |
-
hide_noise,
|
| 2064 |
],
|
| 2065 |
outputs=[results_df],
|
| 2066 |
)
|
|
@@ -2071,18 +2087,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2071 |
).then(
|
| 2072 |
refresh_results,
|
| 2073 |
inputs=[
|
| 2074 |
-
state_df,
|
| 2075 |
-
|
| 2076 |
-
domain_drop,
|
| 2077 |
-
sender_drop,
|
| 2078 |
-
lang_drop,
|
| 2079 |
-
sentiment_drop,
|
| 2080 |
-
tag_drop,
|
| 2081 |
-
date_start,
|
| 2082 |
-
date_end,
|
| 2083 |
-
sort_by,
|
| 2084 |
-
sort_dir,
|
| 2085 |
-
hide_noise,
|
| 2086 |
],
|
| 2087 |
outputs=[results_df],
|
| 2088 |
)
|
|
|
|
| 64 |
except Exception:
|
| 65 |
VADER_OK = False
|
| 66 |
|
| 67 |
+
# ======== STAGE-1 TAXONOMY (Buckets) ========
# Keyword lexicon for stage-1 routing: each bucket maps to lowercase terms
# that route_email_row matches against "subject + body".
# NOTE(review): dict insertion order matters — route_email_row breaks score
# ties via max() over items, which prefers the first-inserted bucket, so keep
# this ordering deliberate.
# NOTE(review): very short terms like "po" or "bid" match as substrings and
# can fire inside unrelated words ("report", "forbid") — consider
# word-boundary matching in the router.
TAXONOMY = {
    "Lobbyist": ["lobby","lobbyist","pac","influence"],
    # "pac" intentionally overlaps with Lobbyist — both buckets get credit.
    "Campaign Finance": ["donation","contribution","fundraiser","pledge","campaign finance","pac"],
    "Procurement": ["contract","tender","rfp","rfq","bid","invoice","vendor","purchase order","po"],
    "HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
    "Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
    "Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
    "Legal": ["legal","lawsuit","attorney","counsel","privileged","court","subpoena","confidential"],
    "IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
    "Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
    # Fallback bucket: no lexicon; assigned when the best score is weak or tied.
    "Other": [],
}
# header/domain cues (expand as you learn)
LOBBY_DOMAINS = set()  # e.g., {"acme-lobby.com"}
LEGAL_DOMAINS = set()  # e.g., {"biglaw.com","firmlaw.com"}
|
| 83 |
+
|
| 84 |
+
def _contains_any(text: str, terms: list[str]) -> bool:
|
| 85 |
+
if not text or not terms: return False
|
| 86 |
+
tl = text.lower()
|
| 87 |
+
return any(t for t in terms if t and t.lower() in tl)
|
| 88 |
+
|
| 89 |
+
def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
    """Header/domain-based score bonus for *bucket* on a single email row.

    Structural cues (newsletter shape, known lobbyist/legal domains,
    calendar invites) add fixed points on top of the keyword lexicon.
    Returns 0.0 for buckets without a header heuristic.
    """
    sender_domain = (row.get("from_domain") or "").lower()
    subject = (row.get("subject") or "")

    if bucket == "Newsletters/Alerts":
        hit = is_news_like(subject, row.get("body_text", ""), sender_domain)
        return 5.0 if hit else 0.0
    if bucket == "IT/Security":
        hit = is_notification_like(subject, row.get("body_text", ""),
                                   row.get("from_email", ""), sender_domain)
        return 5.0 if hit else 0.0
    if bucket == "Constituent":
        # personal mail to a public office is a strong hint
        return 3.0 if sender_domain in PERSONAL_DOMAINS else 0.0
    if bucket == "Lobbyist":
        return 5.0 if sender_domain in LOBBY_DOMAINS else 0.0
    if bucket == "Legal":
        legal_hit = ("law" in sender_domain
                     or sender_domain in LEGAL_DOMAINS
                     or "privileged" in subject.lower())
        return 5.0 if legal_hit else 0.0
    if bucket == "Scheduling":
        # ICS invite (attachment name or body reference) or explicit invite subject
        body = (row.get("body_text") or "")
        attach_blob = " ".join(row.get("attachments") or [])
        sched_hit = (ATTACH_NAME_RE.search(attach_blob)
                     or re.search(r"\binvitation\b|\binvite\b", subject, re.I)
                     or re.search(r"\.ics\b", body, re.I))
        return 3.0 if sched_hit else 0.0
    return 0.0
|
| 108 |
+
|
| 109 |
+
MIN_ROUTE_SCORE = 1.5  # at least ~2 weak signals or one strong
TIE_MARGIN = 1.0       # winner must beat the runner-up by this much

# Compiled word-boundary patterns for single-token lexicon terms, keyed by
# lowercased term. Hoisted to module level so patterns compile once.
_TOKEN_RE_CACHE: dict[str, re.Pattern] = {}

def _lexicon_hits(terms: list[str], text: str) -> int:
    """Count how many distinct *terms* occur in lowercased *text*.

    Multi-word phrases match as substrings; single tokens are matched on
    word boundaries so short terms like "po" or "bid" do not fire inside
    unrelated words ("report", "forbid").
    """
    hits = 0
    for t in terms:
        if not t:
            continue
        tl = t.lower()
        if " " in tl:
            # phrase: substring match is fine, false positives are unlikely
            if tl in text:
                hits += 1
        else:
            pat = _TOKEN_RE_CACHE.get(tl)
            if pat is None:
                pat = re.compile(r"\b" + re.escape(tl) + r"\b")
                _TOKEN_RE_CACHE[tl] = pat
            if pat.search(text):
                hits += 1
    return hits

def route_email_row(row: pd.Series) -> str:
    """Stage-1 router: assign one email row to a TAXONOMY bucket.

    Scoring: one point per distinct lexicon term found in subject+body,
    +1 for corruption-lexicon phrases on Lobbyist/Procurement, plus
    header/domain bonuses from _bucket_header_bonus(). Falls back to
    "Other" when the best score is weak (< MIN_ROUTE_SCORE) or the top
    two buckets are within TIE_MARGIN of each other.
    """
    text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
    scores: dict[str,float] = {b: 0.0 for b in TAXONOMY.keys()}
    # lexicon points (unique term hits, to avoid over-crediting repeats)
    for b, terms in TAXONOMY.items():
        if not terms:
            continue
        scores[b] += float(_lexicon_hits(terms, text))
        # strong phrases in the corruption lexicon can hint Lobbyist/Procurement
        if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
            scores[b] += 1.0
    # header bonuses
    for b in TAXONOMY.keys():
        scores[b] += _bucket_header_bonus(row, b)
    # choose: require a clear, sufficiently strong winner
    best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
    second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
    if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
        return "Other"
    return best_bucket
|
| 134 |
+
|
| 135 |
# =================== Regex & Flags ===================
|
| 136 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 137 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
|
|
|
| 1150 |
score += 0.3
|
| 1151 |
return score
|
| 1152 |
|
| 1153 |
+
def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
    """Blend per-bucket anomaly rank, bucket-rule violations and corruption
    heuristics into a 0-10 ``context_anomaly_score`` column.

    Components:
      1. IsolationForest percentile within each bucket -> 0-6 points
      2. bucket-expectation rule violations            -> 0-2 points
      3. corruption heuristics (capped)                -> 0-3 points

    Returns a new DataFrame; *df_in* is never mutated.
    """
    if df_in.empty:
        # copy so the empty-input path doesn't mutate the caller's frame
        out = df_in.copy()
        out["context_anomaly_score"] = 0.0
        return out

    df = df_in.copy()

    # 1) IsolationForest percentile -> 0-6 points. In this pipeline higher
    #    anomaly_score = more anomalous (score_samples was negated upstream),
    #    so an ASCENDING percentile rank gives the top anomaly 1.0.
    #    (ascending=False would invert this and score the top anomaly ~1/n.)
    if "anomaly_score" in df.columns:
        df["_if_pct"] = 0.0
        for _bucket, grp in df.groupby("bucket", dropna=False):
            vals = grp["anomaly_score"].astype(float)
            if vals.notna().sum() >= 5:  # need a few rows for a meaningful percentile
                ranks = vals.rank(pct=True)  # ascending: top anomaly -> 1.0
                # NaN anomaly rows get 0 so they can't poison the final sum
                df.loc[grp.index, "_if_pct"] = ranks.fillna(0.0).clip(0, 1)
        df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
    else:
        df["_if_pts"] = 0.0

    # 2) Rule violations per bucket (0-2): emails routed to a bucket that
    #    lack the bucket's expected vocabulary / header shape look suspicious.
    df["_rule_pts"] = 0.0
    low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
    for bkt, terms in TAXONOMY.items():
        mask = (df["bucket"] == bkt)
        if not mask.any():
            continue
        if terms:
            has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
            df.loc[mask & (~has_term), "_rule_pts"] += 1.0
        # header expectation examples:
        if bkt == "Constituent":
            # fillna("") so rows with a missing domain don't produce NaN masks
            dom = df["from_domain"].fillna("").str.lower()
            df.loc[mask & (~dom.isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
        if bkt == "Scheduling":
            # full-index series so the boolean masks align; (?:...) groups the
            # alternation so \b applies to every keyword, not just the ends
            subj = df["subject"].fillna("").str.lower()
            sched_ok = subj.str.contains(r"\b(?:meeting|invite|schedule|calendar)\b", regex=True)
            df.loc[mask & (~sched_ok), "_rule_pts"] += 1.0

    df["_rule_pts"] = df["_rule_pts"].clip(0, 2)

    # 3) Corruption heuristics capped to 0-3 (guarded: the column is computed
    #    later in the pipeline, so tolerate its absence)
    if "corruption_score" in df.columns:
        df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
    else:
        df["_corr_pts"] = 0.0

    df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
    return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
|
| 1196 |
+
|
| 1197 |
# =================== Gradio UI ===================
|
| 1198 |
CSS = """
|
| 1199 |
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
|
|
|
|
| 1257 |
embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="")
|
| 1258 |
embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False)
|
| 1259 |
with gr.Row():
|
| 1260 |
+
bucket_drop = gr.Dropdown(label="Bucket", choices=["(any)"] + list(TAXONOMY.keys()), value="(any)", allow_custom_value=False)
|
| 1261 |
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 1262 |
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
| 1263 |
sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
|
|
|
|
| 1267 |
with gr.Row():
|
| 1268 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 1269 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 1270 |
+
sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
|
| 1271 |
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 1272 |
# NEW: hide noise toggle
|
| 1273 |
hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
|
|
|
|
| 1340 |
|
| 1341 |
def _apply_filters(
|
| 1342 |
df: pd.DataFrame,
|
| 1343 |
+
bucket: Optional[str],
|
| 1344 |
cluster: Optional[str],
|
| 1345 |
domain: Optional[str],
|
| 1346 |
sender: Optional[str],
|
|
|
|
| 1349 |
tag_value: str,
|
| 1350 |
start: str,
|
| 1351 |
end: str,
|
| 1352 |
+
hide_noise_flag: bool = False,
|
| 1353 |
) -> pd.DataFrame:
|
| 1354 |
out = df
|
| 1355 |
+
if bucket and bucket != "(any)":
|
| 1356 |
+
out = out[out["bucket"] == bucket]
|
| 1357 |
if cluster and cluster != "(any)":
|
| 1358 |
+
# Modified to parse the new cluster label format
|
| 1359 |
+
m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
|
| 1360 |
if m:
|
| 1361 |
cid = int(m.group(1))
|
| 1362 |
out = out[out["cluster_id"] == cid]
|
|
|
|
| 1415 |
if inbox_file is None:
|
| 1416 |
return ("**Please upload a file.**",
|
| 1417 |
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1418 |
+
None, None, None, None, None)
|
| 1419 |
|
| 1420 |
# === Vectorization & Clustering (UPGRADED) ===
|
| 1421 |
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
|
|
|
| 1601 |
if not recs:
|
| 1602 |
return ("**No valid records found.**",
|
| 1603 |
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1604 |
+
None, None, None, None, None)
|
| 1605 |
|
| 1606 |
normd = []
|
| 1607 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
|
|
|
| 1612 |
if df.empty:
|
| 1613 |
return ("**No usable email records.**",
|
| 1614 |
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1615 |
+
None, None, None, None, None)
|
| 1616 |
|
| 1617 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 1618 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 1619 |
df = compute_sentiment_column(df)
|
| 1620 |
|
| 1621 |
+
# >>> NEW: stage-1 routing
|
| 1622 |
+
df["bucket"] = df.apply(route_email_row, axis=1)
|
| 1623 |
+
df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
|
| 1624 |
+
df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
|
| 1625 |
+
df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
|
| 1626 |
+
df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
|
| 1627 |
+
|
| 1628 |
flags = []
|
| 1629 |
for _, row in df.iterrows():
|
| 1630 |
f = []
|
|
|
|
| 1638 |
flags.append(f)
|
| 1639 |
df["flags"] = flags
|
| 1640 |
|
| 1641 |
+
df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
|
| 1642 |
+
df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
|
| 1643 |
+
df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
| 1644 |
|
| 1645 |
kv = None
|
| 1646 |
emb_dim = 0
|
|
|
|
| 1648 |
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1649 |
|
| 1650 |
parts = []
|
| 1651 |
+
if bool(per_language):
|
| 1652 |
+
for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
|
| 1653 |
+
for lang_code, grp in g_bucket.groupby("lang", dropna=False):
|
| 1654 |
+
parts.append(((bkt, lang_code), grp.copy()))
|
| 1655 |
else:
|
| 1656 |
+
for bkt, grp in df_main.groupby("bucket", dropna=False):
|
| 1657 |
+
parts.append(((bkt, "all"), grp.copy()))
|
| 1658 |
|
| 1659 |
labels_list, cluster_name_list, anomaly_list = [], [], []
|
| 1660 |
+
bucket_indexers = [] # keep index locations to reassign later
|
| 1661 |
X_reduced_holder = None
|
| 1662 |
term_names_global = {}
|
| 1663 |
single_partition = (len(parts) == 1)
|
| 1664 |
d_word_agg, d_char_agg, k_agg = 0, 0, 0
|
| 1665 |
svd_obj_local, norm_obj_local = None, None
|
| 1666 |
|
| 1667 |
+
for (bucket_name, _lang), df_part in parts:
|
| 1668 |
if df_part.empty:
|
| 1669 |
continue
|
| 1670 |
texts, subjects_only = _make_texts(df_part)
|
|
|
|
| 1713 |
d_word=d_word,
|
| 1714 |
d_char=d_char,
|
| 1715 |
)
|
| 1716 |
+
labels = stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1717 |
k_agg += len(set(labels))
|
| 1718 |
|
| 1719 |
term_names = cluster_labels_pmi_bigram(
|
| 1720 |
texts=texts, labels=labels, subjects=subjects_only,
|
| 1721 |
topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
|
| 1722 |
)
|
|
|
|
| 1723 |
|
| 1724 |
+
# collect to write back in one go
|
| 1725 |
+
bucket_indexers.append(df_part.index)
|
| 1726 |
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1727 |
+
cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1728 |
anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
|
| 1729 |
+
# remember term names for the reader pill
|
| 1730 |
+
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1731 |
|
| 1732 |
if single_partition and X_reduced is not None:
|
| 1733 |
X_reduced_holder = X_reduced
|
| 1734 |
+
|
| 1735 |
if labels_list:
|
| 1736 |
+
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
|
| 1737 |
+
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
|
| 1738 |
+
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "anomaly_score"] = pd.concat(anomaly_list).sort_index()
|
| 1739 |
else:
|
| 1740 |
df_main["cluster_id"] = -10
|
| 1741 |
df_main["cluster_name"] = "unclustered"
|
|
|
|
| 1752 |
|
| 1753 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 1754 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1755 |
+
df = compute_context_anomaly(df)
|
| 1756 |
|
| 1757 |
index_obj = None
|
| 1758 |
use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
|
|
|
|
| 1771 |
pass
|
| 1772 |
|
| 1773 |
cluster_counts = (
|
| 1774 |
+
df.groupby(["bucket", "cluster_id", "cluster_name"])
|
| 1775 |
+
.size()
|
| 1776 |
+
.reset_index(name="count")
|
| 1777 |
+
.sort_values("count", ascending=False)
|
| 1778 |
+
.head(500)
|
| 1779 |
)
|
| 1780 |
cluster_counts["label"] = cluster_counts.apply(
|
| 1781 |
+
lambda r: f'{r["bucket"]} — {int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 1782 |
)
|
| 1783 |
cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
|
| 1784 |
+
bucket_choices = ["(any)"] + sorted(df["bucket"].dropna().unique().tolist())
|
| 1785 |
+
|
| 1786 |
domain_counts = (
|
| 1787 |
df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False).head(100)
|
| 1788 |
)
|
|
|
|
| 1800 |
.sort_values("corruption_score", ascending=False)
|
| 1801 |
.head(200)
|
| 1802 |
)
|
| 1803 |
+
out_table = _sort_results(df, "context_anomaly_score", "desc")
|
| 1804 |
|
| 1805 |
vec_state = {
|
| 1806 |
"use_hashing": bool(use_hashing),
|
|
|
|
| 1816 |
norm_obj_out = norm_obj_local if single_partition else None
|
| 1817 |
|
| 1818 |
return (
|
| 1819 |
+
status_md, cluster_counts, domain_counts, sender_counts, actors, offhours_table,
|
| 1820 |
+
out_table, df, vec_state, X_reduced_holder, index_obj, term_names_global,
|
| 1821 |
+
bool(use_lsa), use_faiss_flag,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1822 |
gr.update(choices=cluster_choices, value="(any)"),
|
| 1823 |
+
gr.update(choices=domain_choices, value="(any)"),
|
| 1824 |
+
gr.update(choices=sender_choices, value="(any)"),
|
| 1825 |
gr.update(choices=lang_choices, value="(any)"),
|
| 1826 |
+
svd_obj_out, norm_obj_out, (d_word_agg, d_char_agg),
|
| 1827 |
+
extra_terms_lower, bool(highlight_toggle),
|
| 1828 |
+
gr.update(choices=bucket_choices, value="(any)")
|
|
|
|
|
|
|
| 1829 |
)
|
| 1830 |
|
| 1831 |
(run_btn.click)(
|
| 1832 |
process_file,
|
| 1833 |
inputs=[
|
| 1834 |
+
inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1835 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1836 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1837 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1838 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1839 |
],
|
| 1840 |
outputs=[
|
| 1841 |
+
status, cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 1842 |
+
actors_df, offhours_df, results_df,
|
| 1843 |
+
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 1844 |
+
state_use_lsa, state_use_faiss,
|
| 1845 |
+
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 1846 |
+
state_svd, state_norm, state_dims, state_extra_terms, state_highlight,
|
| 1847 |
+
bucket_drop,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1848 |
],
|
| 1849 |
)
|
| 1850 |
|
|
|
|
| 1857 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 1858 |
else:
|
| 1859 |
tmp["_dt"] = pd.NaT
|
| 1860 |
+
by = by if by in tmp.columns else "context_anomaly_score"
|
| 1861 |
asc = (direction == "asc")
|
| 1862 |
sort_cols = [by]
|
| 1863 |
if by == "date":
|
| 1864 |
sort_cols = ["_dt"]
|
| 1865 |
+
elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
|
| 1866 |
sort_cols.append("_dt")
|
| 1867 |
|
| 1868 |
tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
|
| 1869 |
|
| 1870 |
cols_out = [
|
| 1871 |
+
"date","bucket","from_email","from_domain","subject","cluster_name","lang",
|
| 1872 |
+
"tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1873 |
]
|
| 1874 |
if "search_score" in tmp.columns:
|
| 1875 |
cols_out.append("search_score")
|
| 1876 |
|
| 1877 |
return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
|
| 1878 |
|
| 1879 |
+
def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
|
| 1880 |
if df is None or len(df) == 0:
|
| 1881 |
return pd.DataFrame()
|
| 1882 |
filt = _apply_filters(
|
| 1883 |
+
df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
|
| 1884 |
)
|
| 1885 |
return _sort_results(filt, sort_by, sort_dir)
|
| 1886 |
|
| 1887 |
# Re-run when any filter control changes (including hide_noise)
|
| 1888 |
+
for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1889 |
date_start, date_end, sort_by, sort_dir, hide_noise]:
|
| 1890 |
ctrl.change(
|
| 1891 |
refresh_results,
|
| 1892 |
inputs=[
|
| 1893 |
+
state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 1894 |
+
sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1895 |
],
|
| 1896 |
outputs=[results_df],
|
| 1897 |
)
|
| 1898 |
|
| 1899 |
+
# Reset filters
|
| 1900 |
reset_btn.click(
|
| 1901 |
+
lambda: ["(any)"] * 7 + [""] * 2 + ["context_anomaly_score", "desc"] + [True],
|
| 1902 |
[],
|
| 1903 |
+
[bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1904 |
date_start, date_end, sort_by, sort_dir, hide_noise],
|
| 1905 |
).then(
|
| 1906 |
refresh_results,
|
| 1907 |
inputs=[
|
| 1908 |
+
state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 1909 |
+
sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1910 |
],
|
| 1911 |
outputs=[results_df],
|
| 1912 |
)
|
|
|
|
| 2003 |
search_btn.click(
|
| 2004 |
search_fn,
|
| 2005 |
inputs=[
|
| 2006 |
+
search_query, state_df, state_vec, state_X_reduced, state_index,
|
| 2007 |
+
state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2008 |
],
|
| 2009 |
outputs=[results_df, state_query_terms],
|
| 2010 |
)
|
|
|
|
| 2033 |
return build_highlighted_html(
|
| 2034 |
row,
|
| 2035 |
query_terms=q_terms,
|
| 2036 |
+
cluster_label=f'{row.get("bucket","Other")} / {clabel}',
|
| 2037 |
do_highlight=do_highlight,
|
| 2038 |
extra_terms=extra_terms,
|
| 2039 |
)
|
|
|
|
| 2050 |
return gr.update()
|
| 2051 |
val = df_sum.iloc[evt.index[0]][col_name]
|
| 2052 |
return gr.update(value=val)
|
| 2053 |
+
|
| 2054 |
+
def on_cluster_summary_select(evt: gr.SelectData, df_sum: pd.DataFrame):
|
| 2055 |
+
if evt.index is None or df_sum is None or df_sum.empty:
|
| 2056 |
+
return gr.update(), gr.update()
|
| 2057 |
+
r = df_sum.iloc[evt.index[0]]
|
| 2058 |
+
return gr.update(value=r["bucket"]), gr.update(value=r["label"])
|
| 2059 |
|
| 2060 |
+
# Cluster summary → set bucket & cluster filter
|
| 2061 |
cluster_counts_df.select(
|
| 2062 |
+
on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
|
| 2063 |
).then(
|
| 2064 |
refresh_results,
|
| 2065 |
inputs=[
|
| 2066 |
+
state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 2067 |
+
sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2068 |
],
|
| 2069 |
outputs=[results_df],
|
| 2070 |
)
|
|
|
|
| 2075 |
).then(
|
| 2076 |
refresh_results,
|
| 2077 |
inputs=[
|
| 2078 |
+
state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 2079 |
+
sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2080 |
],
|
| 2081 |
outputs=[results_df],
|
| 2082 |
)
|
|
|
|
| 2087 |
).then(
|
| 2088 |
refresh_results,
|
| 2089 |
inputs=[
|
| 2090 |
+
state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 2091 |
+
sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2092 |
],
|
| 2093 |
outputs=[results_df],
|
| 2094 |
)
|