Update app.py
Browse files
app.py
CHANGED
|
@@ -81,7 +81,7 @@ TAXONOMY = {
|
|
| 81 |
LOBBY_DOMAINS = set() # e.g., {"acme-lobby.com"}
|
| 82 |
LEGAL_DOMAINS = set() # e.g., {"biglaw.com","firmlaw.com"}
|
| 83 |
|
| 84 |
-
def _contains_any(text: str, terms: list
|
| 85 |
if not text or not terms: return False
|
| 86 |
tl = text.lower()
|
| 87 |
return any(t for t in terms if t and t.lower() in tl)
|
|
@@ -111,7 +111,7 @@ TIE_MARGIN = 1.0
|
|
| 111 |
|
| 112 |
def route_email_row(row: pd.Series) -> str:
|
| 113 |
text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 114 |
-
scores: dict
|
| 115 |
# lexicon points
|
| 116 |
for b, terms in TAXONOMY.items():
|
| 117 |
if not terms:
|
|
@@ -1194,7 +1194,245 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
|
|
| 1194 |
df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
|
| 1195 |
return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
|
| 1196 |
|
| 1197 |
-
# ===================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1198 |
CSS = """
|
| 1199 |
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
|
| 1200 |
.email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
|
|
@@ -1213,8 +1451,13 @@ hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
|
| 1213 |
.cursor { cursor:pointer; }
|
| 1214 |
"""
|
| 1215 |
|
| 1216 |
-
|
|
|
|
| 1217 |
gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1218 |
|
| 1219 |
with gr.Row():
|
| 1220 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
|
@@ -1233,7 +1476,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1233 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 1234 |
lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
|
| 1235 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 1236 |
-
k_clusters = gr.Number(label="k (
|
| 1237 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 1238 |
with gr.Row():
|
| 1239 |
use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
|
|
@@ -1263,14 +1506,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1263 |
sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
|
| 1264 |
lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
|
| 1265 |
sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
|
| 1266 |
-
tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "odd-hours", "personal-mail"], value="(any)")
|
| 1267 |
with gr.Row():
|
| 1268 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 1269 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 1270 |
sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
|
| 1271 |
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 1272 |
-
|
| 1273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1274 |
|
| 1275 |
with gr.Row():
|
| 1276 |
run_btn = gr.Button("Process", variant="primary")
|
|
@@ -1287,6 +1535,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1287 |
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
| 1288 |
offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
|
| 1289 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1290 |
gr.Markdown("### Search")
|
| 1291 |
with gr.Row():
|
| 1292 |
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
|
@@ -1294,7 +1547,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1294 |
results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
|
| 1295 |
email_view = gr.HTML(label="Reader")
|
| 1296 |
|
| 1297 |
-
# State
|
| 1298 |
state_df = gr.State()
|
| 1299 |
state_vec = gr.State()
|
| 1300 |
state_X_reduced = gr.State()
|
|
@@ -1355,7 +1608,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1355 |
if bucket and bucket != "(any)":
|
| 1356 |
out = out[out["bucket"] == bucket]
|
| 1357 |
if cluster and cluster != "(any)":
|
| 1358 |
-
# Modified to parse the new cluster label format
|
| 1359 |
m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
|
| 1360 |
if m:
|
| 1361 |
cid = int(m.group(1))
|
|
@@ -1387,7 +1639,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1387 |
out = out[out["cluster_id"] != -3]
|
| 1388 |
return out
|
| 1389 |
|
| 1390 |
-
# --------
|
| 1391 |
def social_stats(df: pd.DataFrame) -> pd.DataFrame:
|
| 1392 |
deg = {}
|
| 1393 |
def add_edge(a,b):
|
|
@@ -1405,19 +1657,104 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1405 |
out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
|
| 1406 |
return out
|
| 1407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1408 |
# -------- Main pipeline --------
|
| 1409 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1410 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1411 |
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1412 |
-
# NEW:
|
| 1413 |
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1414 |
-
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary
|
|
|
|
| 1415 |
if inbox_file is None:
|
| 1416 |
-
return (
|
| 1417 |
-
|
| 1418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1419 |
|
| 1420 |
-
# ===
|
| 1421 |
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
| 1422 |
texts = list(df_in.apply(enrich_text, axis=1))
|
| 1423 |
subjects_only = list(df_in["subject"].fillna(""))
|
|
@@ -1516,6 +1853,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1516 |
|
| 1517 |
def _cluster_space(
|
| 1518 |
X_space,
|
|
|
|
| 1519 |
df_part: pd.DataFrame,
|
| 1520 |
use_lsa: bool,
|
| 1521 |
use_hdbscan: bool,
|
|
@@ -1531,11 +1869,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1531 |
):
|
| 1532 |
n = X_space.shape[0]
|
| 1533 |
|
|
|
|
|
|
|
|
|
|
| 1534 |
if n <= 1:
|
| 1535 |
labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
|
| 1536 |
centers = None
|
| 1537 |
chosen_k = int(n) if n > 0 else 0
|
| 1538 |
-
return labels, centers, chosen_k
|
|
|
|
| 1539 |
if n < 10:
|
| 1540 |
k_small = min(max(2, n // 2), n)
|
| 1541 |
kmeans = MiniBatchKMeans(
|
|
@@ -1546,6 +1888,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1546 |
)
|
| 1547 |
labels = kmeans.fit_predict(X_space)
|
| 1548 |
centers = getattr(kmeans, "cluster_centers_", None)
|
|
|
|
| 1549 |
return labels, centers, int(len(set(labels)))
|
| 1550 |
|
| 1551 |
if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
|
|
@@ -1559,9 +1902,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1559 |
)
|
| 1560 |
labels = clusterer.fit_predict(X_space)
|
| 1561 |
centers = None
|
|
|
|
| 1562 |
chosen_k = int(len(set([l for l in labels if l >= 0])))
|
| 1563 |
return labels, centers, chosen_k
|
| 1564 |
|
|
|
|
| 1565 |
if bool(auto_k):
|
| 1566 |
if use_lsa and isinstance(X_space, np.ndarray):
|
| 1567 |
k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
|
|
@@ -1569,6 +1914,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1569 |
k = auto_k_rule(X_space.shape[0])
|
| 1570 |
else:
|
| 1571 |
k = max(10, int(k_clusters or 350))
|
|
|
|
| 1572 |
|
| 1573 |
init = None
|
| 1574 |
if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
|
|
@@ -1590,9 +1936,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1590 |
centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
|
| 1591 |
if use_lsa and centers is not None:
|
| 1592 |
labels = merge_close_clusters(labels, centers, thresh=0.95)
|
|
|
|
| 1593 |
chosen_k = int(len(set(labels)))
|
| 1594 |
return labels, centers, chosen_k
|
| 1595 |
|
|
|
|
| 1596 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 1597 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1598 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
|
@@ -1600,8 +1948,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1600 |
recs = _load_json_records(inbox_file.name)
|
| 1601 |
if not recs:
|
| 1602 |
return ("**No valid records found.**",
|
| 1603 |
-
None, None, None, None, None,
|
| 1604 |
-
None, None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1605 |
|
| 1606 |
normd = []
|
| 1607 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
|
@@ -1611,20 +1966,28 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1611 |
df = pd.DataFrame(normd)
|
| 1612 |
if df.empty:
|
| 1613 |
return ("**No usable email records.**",
|
| 1614 |
-
None, None, None, None, None,
|
| 1615 |
-
None, None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1616 |
|
| 1617 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 1618 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 1619 |
df = compute_sentiment_column(df)
|
| 1620 |
|
| 1621 |
-
#
|
| 1622 |
df["bucket"] = df.apply(route_email_row, axis=1)
|
| 1623 |
df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
|
| 1624 |
df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
|
| 1625 |
df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
|
| 1626 |
df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
|
| 1627 |
|
|
|
|
| 1628 |
flags = []
|
| 1629 |
for _, row in df.iterrows():
|
| 1630 |
f = []
|
|
@@ -1638,15 +2001,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1638 |
flags.append(f)
|
| 1639 |
df["flags"] = flags
|
| 1640 |
|
|
|
|
| 1641 |
df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
|
| 1642 |
df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
|
| 1643 |
df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
|
| 1644 |
|
|
|
|
| 1645 |
kv = None
|
| 1646 |
emb_dim = 0
|
| 1647 |
if bool(use_embeddings):
|
| 1648 |
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1649 |
|
|
|
|
| 1650 |
parts = []
|
| 1651 |
if bool(per_language):
|
| 1652 |
for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
|
|
@@ -1657,11 +2023,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1657 |
parts.append(((bkt, "all"), grp.copy()))
|
| 1658 |
|
| 1659 |
labels_list, cluster_name_list, anomaly_list = [], [], []
|
| 1660 |
-
bucket_indexers = []
|
| 1661 |
X_reduced_holder = None
|
| 1662 |
term_names_global = {}
|
| 1663 |
single_partition = (len(parts) == 1)
|
| 1664 |
-
d_word_agg, d_char_agg
|
| 1665 |
svd_obj_local, norm_obj_local = None, None
|
| 1666 |
|
| 1667 |
for (bucket_name, _lang), df_part in parts:
|
|
@@ -1697,8 +2063,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1697 |
except Exception:
|
| 1698 |
pass
|
| 1699 |
|
| 1700 |
-
labels, _,
|
| 1701 |
X_space=X_space,
|
|
|
|
| 1702 |
df_part=df_part,
|
| 1703 |
use_lsa=bool(use_lsa) and X_reduced is not None,
|
| 1704 |
use_hdbscan=bool(use_hdbscan),
|
|
@@ -1713,25 +2080,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1713 |
d_word=d_word,
|
| 1714 |
d_char=d_char,
|
| 1715 |
)
|
| 1716 |
-
labels = stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35)
|
| 1717 |
-
k_agg += len(set(labels))
|
| 1718 |
|
| 1719 |
term_names = cluster_labels_pmi_bigram(
|
| 1720 |
texts=texts, labels=labels, subjects=subjects_only,
|
| 1721 |
topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
|
| 1722 |
)
|
| 1723 |
|
| 1724 |
-
# collect to write back in one go
|
| 1725 |
bucket_indexers.append(df_part.index)
|
| 1726 |
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1727 |
cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
|
| 1728 |
anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
|
| 1729 |
-
# remember term names for the reader pill
|
| 1730 |
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1731 |
|
| 1732 |
if single_partition and X_reduced is not None:
|
| 1733 |
X_reduced_holder = X_reduced
|
| 1734 |
-
|
| 1735 |
if labels_list:
|
| 1736 |
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
|
| 1737 |
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
|
|
@@ -1741,6 +2104,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1741 |
df_main["cluster_name"] = "unclustered"
|
| 1742 |
df_main["anomaly_score"] = np.nan
|
| 1743 |
|
|
|
|
| 1744 |
if len(df_news):
|
| 1745 |
df_news.loc[:, "cluster_id"] = -1
|
| 1746 |
df_news.loc[:, "cluster_name"] = "newsletter/news"
|
|
@@ -1750,10 +2114,22 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1750 |
df_alerts.loc[:, "cluster_name"] = "system/alerts"
|
| 1751 |
df_alerts.loc[:, "anomaly_score"] = np.nan
|
| 1752 |
|
|
|
|
| 1753 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1754 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1755 |
df = compute_context_anomaly(df)
|
| 1756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1757 |
index_obj = None
|
| 1758 |
use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
|
| 1759 |
if use_faiss_flag:
|
|
@@ -1770,6 +2146,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1770 |
except Exception:
|
| 1771 |
pass
|
| 1772 |
|
|
|
|
| 1773 |
cluster_counts = (
|
| 1774 |
df.groupby(["bucket", "cluster_id", "cluster_name"])
|
| 1775 |
.size()
|
|
@@ -1816,9 +2193,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1816 |
norm_obj_out = norm_obj_local if single_partition else None
|
| 1817 |
|
| 1818 |
return (
|
| 1819 |
-
status_md,
|
| 1820 |
-
|
| 1821 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1822 |
gr.update(choices=cluster_choices, value="(any)"),
|
| 1823 |
gr.update(choices=domain_choices, value="(any)"),
|
| 1824 |
gr.update(choices=sender_choices, value="(any)"),
|
|
@@ -1828,6 +2210,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1828 |
gr.update(choices=bucket_choices, value="(any)")
|
| 1829 |
)
|
| 1830 |
|
|
|
|
| 1831 |
(run_btn.click)(
|
| 1832 |
process_file,
|
| 1833 |
inputs=[
|
|
@@ -1836,46 +2219,24 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1836 |
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1837 |
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1838 |
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
|
|
|
|
| 1839 |
],
|
| 1840 |
outputs=[
|
| 1841 |
status, cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 1842 |
-
actors_df, offhours_df,
|
| 1843 |
-
|
|
|
|
|
|
|
|
|
|
| 1844 |
state_use_lsa, state_use_faiss,
|
| 1845 |
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 1846 |
-
state_svd, state_norm, state_dims,
|
|
|
|
| 1847 |
bucket_drop,
|
| 1848 |
],
|
| 1849 |
)
|
| 1850 |
|
| 1851 |
# -------- Filtering & Search --------
|
| 1852 |
-
def _sort_results(df, by, direction):
|
| 1853 |
-
if df is None or len(df) == 0:
|
| 1854 |
-
return pd.DataFrame()
|
| 1855 |
-
tmp = df.copy()
|
| 1856 |
-
if "date" in tmp.columns:
|
| 1857 |
-
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 1858 |
-
else:
|
| 1859 |
-
tmp["_dt"] = pd.NaT
|
| 1860 |
-
by = by if by in tmp.columns else "context_anomaly_score"
|
| 1861 |
-
asc = (direction == "asc")
|
| 1862 |
-
sort_cols = [by]
|
| 1863 |
-
if by == "date":
|
| 1864 |
-
sort_cols = ["_dt"]
|
| 1865 |
-
elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
|
| 1866 |
-
sort_cols.append("_dt")
|
| 1867 |
-
|
| 1868 |
-
tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
|
| 1869 |
-
|
| 1870 |
-
cols_out = [
|
| 1871 |
-
"date","bucket","from_email","from_domain","subject","cluster_name","lang",
|
| 1872 |
-
"tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
|
| 1873 |
-
]
|
| 1874 |
-
if "search_score" in tmp.columns:
|
| 1875 |
-
cols_out.append("search_score")
|
| 1876 |
-
|
| 1877 |
-
return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
|
| 1878 |
-
|
| 1879 |
def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
|
| 1880 |
if df is None or len(df) == 0:
|
| 1881 |
return pd.DataFrame()
|
|
@@ -1884,7 +2245,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1884 |
)
|
| 1885 |
return _sort_results(filt, sort_by, sort_dir)
|
| 1886 |
|
| 1887 |
-
# Re-run when any filter control changes
|
| 1888 |
for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1889 |
date_start, date_end, sort_by, sort_dir, hide_noise]:
|
| 1890 |
ctrl.change(
|
|
@@ -1911,57 +2272,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1911 |
outputs=[results_df],
|
| 1912 |
)
|
| 1913 |
|
| 1914 |
-
# ---
|
| 1915 |
-
def _tokenize_query(q: str) -> List[str]:
|
| 1916 |
-
return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
|
| 1917 |
-
|
| 1918 |
-
def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
    """Project a query feature vector into the LSA space used for documents.

    Applies the fitted TruncatedSVD then the fitted Normalizer and casts the
    result to float32.

    Returns:
        The projected float32 array, or None if either transform fails.
    """
    try:
        return norm.transform(svd.transform(q_vec)).astype(np.float32)
    except Exception:
        # Deliberate best-effort: callers treat None as "LSA unavailable"
        # and fall back to the raw feature space.
        return None
|
| 1923 |
-
|
| 1924 |
-
def _vectorize_query(q, vec_state, corpus_texts):
    """Vectorize a search query with the same feature pipeline used for docs.

    Rebuilds the word-level features (hashing + TF-IDF, or count + BM25,
    mirroring the settings stored in `vec_state`) and the char 3-5-gram
    TF-IDF features from `corpus_texts`, then transforms the query `q`
    through both and stacks the blocks into one sparse CSR row.

    Args:
        q: raw query string.
        vec_state: dict of vectorizer settings captured at processing time
            (use_hashing, use_bigrams, hash_bits, max_features, min_df, max_df).
        corpus_texts: the document texts, used to re-fit corpus statistics.

    Returns:
        A 1-row CSR matrix of [word-features | 0.20 * char-features].
    """
    # Build the same features for the query that we used for docs
    # (min_df=2 would reject every n-gram when there is only one document).
    char_min_df = 1 if len(corpus_texts) <= 1 else 2

    if vec_state.get("use_hashing"):
        hv = HashingVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            n_features=2 ** vec_state.get("hash_bits", 18),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            norm=None,
            alternate_sign=False,
        )
        # Fit TF-IDF weights from corpus
        counts = hv.transform(corpus_texts)
        tfidf_tr = TfidfTransformer().fit(counts)
        q_word = tfidf_tr.transform(hv.transform([q]))
    else:
        cv = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            max_features=vec_state.get("max_features"),
            min_df=vec_state.get("min_df"),
            max_df=vec_state.get("max_df"),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            stop_words=STOPWORD_FOR_VEC,
            dtype=np.float32,
        )
        # Raw term counts -> BM25 weights, matching the document pipeline.
        tf = cv.fit_transform(corpus_texts)
        bm25 = BM25Transformer().fit(tf)
        q_word = bm25.transform(cv.transform([q]))

    # Character n-grams give robustness to typos / morphology.
    char_vec = CharTfidf(
        analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
    ).fit(corpus_texts)
    q_char = char_vec.transform([q])

    # Char block is down-weighted (0.20) relative to the word block.
    return hstack([q_word, q_char * 0.20], format="csr")
|
| 1964 |
-
|
| 1965 |
def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
|
| 1966 |
if not q or df is None or vec is None or index is None:
|
| 1967 |
return pd.DataFrame(), []
|
|
@@ -1984,7 +2295,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1984 |
return pd.DataFrame(), q_terms
|
| 1985 |
|
| 1986 |
if isinstance(index, NearestNeighbors):
|
| 1987 |
-
# brute-force cosine on reduced space
|
| 1988 |
if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
|
| 1989 |
return pd.DataFrame(), q_terms
|
| 1990 |
dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
|
|
@@ -2009,7 +2319,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2009 |
outputs=[results_df, state_query_terms],
|
| 2010 |
)
|
| 2011 |
|
| 2012 |
-
# ---
|
| 2013 |
def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
|
| 2014 |
if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
|
| 2015 |
return ""
|
|
@@ -2044,7 +2354,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2044 |
outputs=[email_view],
|
| 2045 |
)
|
| 2046 |
|
| 2047 |
-
# Click-to-filter
|
| 2048 |
def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
|
| 2049 |
if evt.index is None or df_sum is None or df_sum.empty:
|
| 2050 |
return gr.update()
|
|
@@ -2057,7 +2367,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2057 |
r = df_sum.iloc[evt.index[0]]
|
| 2058 |
return gr.update(value=r["bucket"]), gr.update(value=r["label"])
|
| 2059 |
|
| 2060 |
-
# Cluster summary → set bucket & cluster filter
|
| 2061 |
cluster_counts_df.select(
|
| 2062 |
on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
|
| 2063 |
).then(
|
|
@@ -2069,7 +2378,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2069 |
outputs=[results_df],
|
| 2070 |
)
|
| 2071 |
|
| 2072 |
-
# Domain summary → set domain filter
|
| 2073 |
domain_counts_df.select(
|
| 2074 |
lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
|
| 2075 |
).then(
|
|
@@ -2081,7 +2389,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2081 |
outputs=[results_df],
|
| 2082 |
)
|
| 2083 |
|
| 2084 |
-
# Sender summary → set sender filter
|
| 2085 |
sender_counts_df.select(
|
| 2086 |
lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
|
| 2087 |
).then(
|
|
@@ -2095,4 +2402,5 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 2095 |
|
| 2096 |
if __name__ == "__main__":
|
| 2097 |
# Disable SSR to avoid handler arity warnings under server-side rendering
|
| 2098 |
-
demo.launch(ssr_mode=False)
|
|
|
|
|
|
| 81 |
LOBBY_DOMAINS = set() # e.g., {"acme-lobby.com"}
|
| 82 |
LEGAL_DOMAINS = set() # e.g., {"biglaw.com","firmlaw.com"}
|
| 83 |
|
| 84 |
+
def _contains_any(text: str, terms: list) -> bool:
|
| 85 |
if not text or not terms: return False
|
| 86 |
tl = text.lower()
|
| 87 |
return any(t for t in terms if t and t.lower() in tl)
|
|
|
|
| 111 |
|
| 112 |
def route_email_row(row: pd.Series) -> str:
|
| 113 |
text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 114 |
+
scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
|
| 115 |
# lexicon points
|
| 116 |
for b, terms in TAXONOMY.items():
|
| 117 |
if not terms:
|
|
|
|
| 1194 |
df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
|
| 1195 |
return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")
|
| 1196 |
|
| 1197 |
+
# =================== 🔧 NEW: Per-bucket k & stabilizer params ===================
|
| 1198 |
+
def _bucket_k_multiplier(bucket_name: str) -> float:
|
| 1199 |
+
b = (bucket_name or "").lower()
|
| 1200 |
+
if b in ("constituent",): return 1.25
|
| 1201 |
+
if b in ("procurement", "campaign finance", "receipts/billing", "lobbyist"): return 1.15
|
| 1202 |
+
if b in ("scheduling", "other"): return 1.00
|
| 1203 |
+
if b in ("legal",): return 0.80
|
| 1204 |
+
return 1.00
|
| 1205 |
+
|
| 1206 |
+
def _bucket_stabilizer_params(bucket_name: str) -> Tuple[int, float, float]:
|
| 1207 |
+
b = (bucket_name or "").lower()
|
| 1208 |
+
if b == "legal": return (30, 0.97, 0.38)
|
| 1209 |
+
if b == "procurement": return (35, 0.96, 0.36)
|
| 1210 |
+
if b == "campaign finance": return (35, 0.96, 0.36)
|
| 1211 |
+
if b == "constituent": return (40, 0.95, 0.33)
|
| 1212 |
+
if b == "receipts/billing": return (40, 0.95, 0.35)
|
| 1213 |
+
if b == "scheduling": return (35, 0.95, 0.35)
|
| 1214 |
+
return (40, 0.96, 0.35)
|
| 1215 |
+
|
| 1216 |
+
# =================== 🔧 NEW: Label de-dup helpers ===================
|
| 1217 |
+
def _normalize_label_tokens(label: str) -> set:
    """Tokenize a cluster label into a normalized set of content words.

    Lowercases, extracts Latin/Hebrew word tokens, strips a trailing plural
    's' from longer tokens, and drops stopwords so two labels can be
    compared by Jaccard similarity.
    """
    if not label: return set()
    txt = str(label).lower()
    # Tokens of >=2 chars starting with a Latin or Hebrew (U+0590-U+05FF)
    # letter; interior hyphens and apostrophes are kept.
    toks = re.findall(r"[a-z\u0590-\u05FF][a-z\u0590-\u05FF\-']{1,}", txt)
    # Crude singularization: drop a trailing 's' on tokens longer than 3 chars.
    toks2 = [t[:-1] if len(t) > 3 and t.endswith("s") else t for t in toks]
    # STOP_TERMS / EN_STOP / HE_STOP are module-level stopword sets.
    return {t for t in toks2 if t not in STOP_TERMS and t not in EN_STOP and t not in HE_STOP and len(t) >= 2}
|
| 1223 |
+
|
| 1224 |
+
def _jaccard(a: set, b: set) -> float:
|
| 1225 |
+
if not a or not b: return 0.0
|
| 1226 |
+
inter = len(a & b)
|
| 1227 |
+
if inter == 0: return 0.0
|
| 1228 |
+
return inter / float(len(a | b))
|
| 1229 |
+
|
| 1230 |
+
def dedupe_cluster_labels_in_bucket(df: pd.DataFrame, bucket: str, sim_thresh: float = 0.72) -> pd.DataFrame:
    """Merge near-duplicate cluster labels within one bucket.

    Clusters whose normalized label-token sets have Jaccard similarity
    >= sim_thresh are grouped (union-find) and all members of a group are
    rewritten to one canonical label (the longest, ties broken
    lexicographically).

    NOTE: mutates `df` in place via df.loc[...] and also returns it.

    Args:
        df: full email DataFrame with "bucket", "cluster_id", "cluster_name".
        bucket: which bucket's labels to dedupe.
        sim_thresh: Jaccard threshold for considering two labels duplicates.

    Returns:
        The same DataFrame with canonicalized cluster_name values.
    """
    sel = df[df["bucket"] == bucket].copy()
    if sel.empty or "cluster_name" not in sel.columns:
        return df

    # One (cluster_id, cluster_name) pair per cluster in this bucket.
    names = sel[["cluster_id", "cluster_name"]].drop_duplicates()
    tokens = {int(cid): _normalize_label_tokens(str(name)) for cid, name in names.values}
    ids = list(tokens.keys())

    # Union-find over cluster ids; no rank/path-compression needed at this scale.
    parent = {i: i for i in ids}
    def find(i):
        while parent[i] != i: i = parent[i]
        return i
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb: parent[rb] = ra

    # O(n^2) pairwise comparison of label token sets.
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            if _jaccard(tokens[ids[i]], tokens[ids[j]]) >= sim_thresh:
                union(ids[i], ids[j])

    names_map = dict(names.values)
    # Group cluster ids by their union-find root.
    comp_to_canon = {}
    for cid in ids:
        root = find(cid)
        comp_to_canon.setdefault(root, [])
        comp_to_canon[root].append((cid, names_map.get(cid, "")))

    # Pick the canonical label per component: longest name wins,
    # ties broken by lexicographic order.
    canon_for_cluster = {}
    for root, items in comp_to_canon.items():
        best = max(items, key=lambda kv: (len(kv[1] or ""), kv[1]))
        for cid, _ in items:
            canon_for_cluster[cid] = best[1]

    # Write canonical names back onto the selected rows of the original df.
    df.loc[sel.index, "cluster_name"] = sel["cluster_id"].map(lambda c: canon_for_cluster.get(int(c), names_map.get(int(c), "")))
    return df
|
| 1267 |
+
|
| 1268 |
+
def dedupe_all_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Run near-duplicate cluster-label merging over every bucket in df."""
    result = df
    buckets = sorted(df["bucket"].dropna().unique())
    for bucket in buckets:
        result = dedupe_cluster_labels_in_bucket(result, bucket, sim_thresh=0.72)
    return result
|
| 1273 |
+
|
| 1274 |
+
# =================== 🔎 NEW: Surveillance-campaign detection ===================
|
| 1275 |
+
# Keywords whose presence marks a message as part of a recurring
# monitoring / reporting campaign (briefs, dossiers, clippings, ...).
SURV_KEYWORDS = [
    "daily report","daily brief","briefing","sitreps","sitrep","situation report","summary",
    "dossier","monitoring","tracking","watchlist","watch list","profile","surveillance",
    "intel","intelligence","osint","open source intel","clippings","press clips","digest",
    "alert","alerting","dispatch","bulletin","roundup","update"
]
# Case-insensitive alternation over all keywords (each literal escaped).
SURV_RE = re.compile("|".join([re.escape(k) for k in SURV_KEYWORDS]), re.I)

# Number groups such as "12", "1,234", "3/4-5" (digits joined by , . / -).
SUBJ_NUM_RE = re.compile(r"\b\d{1,4}([,./-]\d{1,4})*\b")
# Date fragments: 01/02, 01-02-2024, ISO 2024-01-02, or English month abbreviations.
SUBJ_DATE_RE = re.compile(r"\b(?:\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?|\d{4}-\d{2}-\d{2}|jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b", re.I)
# Leading reply/forward prefix on a subject ("Re:", "Fw:", "Fwd:").
SUBJ_FW_RE = re.compile(r"^\s*(re:|fw:|fwd:)\s*", re.I)
# Loose e-mail address matcher used when templating subjects.
EMAIL_RE = re.compile(r"\b[\w.\-+%]+@[\w.-]+\.[A-Za-z]{2,}\b")
|
| 1287 |
+
|
| 1288 |
+
def _candidate_entities_from_subjects(df: pd.DataFrame, extra_watchlist: List[str]) -> List[str]:
    """Collect candidate person/organization names from email subjects.

    Seeds the candidate set with the caller-supplied watchlist, then adds
    every 1-3-word Title Case sequence found in the subjects, filtering out
    stopwords and very short matches.

    Args:
        df: emails with a "subject" column.
        extra_watchlist: names to always include (whitespace-trimmed).

    Returns:
        Sorted candidate names, capped at 3000 entries.
    """
    cand = set([w.strip() for w in (extra_watchlist or []) if w.strip()])
    subs = df["subject"].dropna().astype(str).tolist()
    # 1-3 consecutive capitalized words, e.g. "John", "John Doe", "New York City".
    pat = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b")
    for s in subs:
        for m in pat.finditer(s):
            name = m.group(1).strip()
            # Drop English stopwords and matches shorter than 5 chars
            # (single short words are almost never real entities).
            if name.lower() in EN_STOP or len(name) < 5:
                continue
            cand.add(name)
    out = sorted(cand)
    # Hard cap to bound the per-entity scan that follows.
    return out[:3000]
|
| 1300 |
+
|
| 1301 |
+
def _normalize_subject_template(subj: str, entity: str) -> str:
|
| 1302 |
+
if not subj: return ""
|
| 1303 |
+
s = SUBJ_FW_RE.sub("", subj)
|
| 1304 |
+
try:
|
| 1305 |
+
s = re.sub(re.escape(entity), "«ENTITY»", s, flags=re.I)
|
| 1306 |
+
except Exception:
|
| 1307 |
+
pass
|
| 1308 |
+
s = SUBJ_DATE_RE.sub("«DATE»", s)
|
| 1309 |
+
s = SUBJ_NUM_RE.sub("«NUM»", s)
|
| 1310 |
+
s = EMAIL_RE.sub("«EMAIL»", s)
|
| 1311 |
+
s = re.sub(r"\s+", " ", s).strip().lower()
|
| 1312 |
+
return s
|
| 1313 |
+
|
| 1314 |
+
def _entity_mask_present(row: pd.Series, entity: str) -> bool:
|
| 1315 |
+
t = (row.get("subject","") + " " + row.get("body_text","")).lower()
|
| 1316 |
+
e = (entity or "").lower()
|
| 1317 |
+
return (e in t) if e else False
|
| 1318 |
+
|
| 1319 |
+
def detect_surveillance_campaigns(
    df: pd.DataFrame,
    watchlist: Optional[List[str]] = None,
    min_mentions: int = 15,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Score candidate entities for recurring-monitoring ("surveillance
    campaign") patterns.

    For each candidate entity (caller watchlist plus Title-Case phrases
    mined from subjects) appearing in at least ``min_mentions`` emails, a
    composite score is built from:
      * template_max_share — repetition of the normalized subject template,
      * keyword_share      — share of emails hitting SURV_RE keywords,
      * weekly_peak_z      — z-score of the latest week vs. earlier weeks,
      * sender/domain spread, internal-traffic share, average recipients.

    Returns
    -------
    (ent_df, samp_df):
        ent_df  — up to 200 entities ranked by score with their metrics,
        samp_df — up to 8 sample emails per scored entity.
        Both are empty DataFrames when nothing qualifies.
    """
    if df.empty:
        return pd.DataFrame(), pd.DataFrame()

    watch = [w.strip() for w in (watchlist or []) if w.strip()]
    cands = _candidate_entities_from_subjects(df, watch)

    dfd = df.copy()
    dfd["_dt"] = pd.to_datetime(dfd["date"], utc=True, errors="coerce")
    dfd["_day"] = dfd["_dt"].dt.date
    dfd["_week"] = dfd["_dt"].dt.to_period("W").astype(str)
    dfd["_is_news"] = (dfd["bucket"] == "Newsletters/Alerts")
    dfd["_is_it"] = (dfd["bucket"] == "IT/Security")
    dfd["_is_internal"] = ~(dfd["_is_news"] | dfd["_is_it"])
    dfd["_recips"] = dfd["to_emails"].apply(lambda xs: len(xs) if isinstance(xs, list) else 0)

    rows = []
    samples = []

    for entity in cands:
        mask = dfd.apply(lambda r: _entity_mask_present(r, entity), axis=1)
        grp = dfd[mask]
        n = len(grp)
        if n < int(min_mentions):
            continue

        n_senders = grp["from_email"].nunique()
        n_domains = grp["from_domain"].nunique()
        pct_news = float(grp["_is_news"].mean() if n else 0.0)
        pct_int = float(grp["_is_internal"].mean() if n else 0.0)
        avg_recips = float(grp["_recips"].mean() if n else 0.0)

        # Cadence: z-score of the most recent week against the earlier weeks.
        wk = grp.groupby("_week").size().astype(float)
        if len(wk) >= 4:
            baseline = wk.iloc[:-1]
            mu = float(baseline.mean()) if len(baseline) else 0.0
            sd = float(baseline.std(ddof=1)) if len(baseline) > 1 else 0.0
            last = float(wk.iloc[-1])
            weekly_peak_z = 0.0 if sd == 0.0 else (last - mu) / sd
        else:
            weekly_peak_z = 0.0

        # Template repetition: share held by the most common normalized subject.
        norm_subj = grp["subject"].fillna("").astype(str).map(lambda s: _normalize_subject_template(s, entity))
        if len(norm_subj):
            top_template_share = norm_subj.value_counts(normalize=True).iloc[0]
        else:
            top_template_share = 0.0

        kw_share = float(((grp["subject"].fillna("") + " " + grp["body_text"].fillna("")).str.contains(SURV_RE).mean()) if n else 0.0)

        # Weighted mix of the individual signals, each capped at 1.0.
        score = 0.0
        score += 2.5 * min(1.0, top_template_share)
        score += 2.0 * min(1.0, kw_share)
        score += 1.5 * min(1.0, weekly_peak_z / 3.0)
        score += 0.8 * min(1.0, n_senders / 10.0)
        score += 0.5 * min(1.0, n_domains / 10.0)
        score += 1.0 * pct_int
        score += 0.3 * min(1.0, avg_recips / 10.0)

        level = "info"
        if score >= 6.5:
            level = "likely"
        elif score >= 4.5:
            level = "possible"

        first_d = grp["_dt"].min()
        last_d = grp["_dt"].max()

        rows.append({
            "entity": entity,
            "surveillance_score": round(float(score), 3),
            "level": level,
            "n_emails": int(n),
            "n_senders": int(n_senders),
            "n_domains": int(n_domains),
            "pct_newsletters": round(pct_news, 3),
            "pct_internal": round(pct_int, 3),
            "avg_recipients": round(avg_recips, 2),
            "weekly_peak_z": round(float(weekly_peak_z), 3),
            "template_max_share": round(float(top_template_share), 3),
            "keyword_share": round(float(kw_share), 3),
            "first_date": str(first_d) if pd.notna(first_d) else "",
            "last_date": str(last_d) if pd.notna(last_d) else "",
            "notes": "template/keywords/cadence/senders/domains mix"
        })

        ex = grp[["date","from_email","from_domain","subject","bucket"]].copy().head(8)
        ex.insert(0, "entity", entity)
        samples.append(ex)

    # BUGFIX: sort_values on an empty, column-less DataFrame raises KeyError,
    # so bail out explicitly when no entity cleared min_mentions.
    if not rows:
        return pd.DataFrame(), pd.DataFrame()

    ent_df = pd.DataFrame(rows).sort_values(["surveillance_score","n_emails"], ascending=[False, False]).head(200)
    samp_df = pd.concat(samples, ignore_index=True) if samples else pd.DataFrame()

    return ent_df, samp_df
|
| 1415 |
+
|
| 1416 |
+
def tag_surveillance_emails(df: pd.DataFrame, ent_df: pd.DataFrame, threshold: float = 4.5) -> pd.DataFrame:
    """Add a ``surveillance`` tag to every email mentioning an entity whose
    surveillance_score is at or above *threshold*.

    Returns a copy of *df* with each row's ``tags`` rebuilt as a sorted list;
    the inputs are never mutated. When no entity scores high enough (or
    either frame is empty) the input frame is returned unchanged.
    """
    if df.empty or ent_df.empty:
        return df
    hot = ent_df[ent_df["surveillance_score"] >= float(threshold)]["entity"].tolist()
    if not hot:
        return df
    # Lowercase the hot list once instead of once per row.
    hot_lower = [e.lower() for e in hot]

    def _tag(row):
        # Skip NaN/None fields instead of raising on string concatenation.
        parts = [v for v in (row.get("subject"), row.get("body_text")) if isinstance(v, str)]
        txt = " ".join(parts).lower()
        prev = row.get("tags")
        tags = set(prev) if isinstance(prev, (list, set, tuple)) else set()
        if any(e in txt for e in hot_lower):
            tags.add("surveillance")
        return sorted(tags)

    out = df.copy()
    out["tags"] = out.apply(_tag, axis=1)
    return out
|
| 1432 |
+
|
| 1433 |
+
# =================== UI / PIPELINE CONTINUATION ===================
|
| 1434 |
+
|
| 1435 |
+
# ---------- Styles ----------
|
| 1436 |
CSS = """
|
| 1437 |
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
|
| 1438 |
.email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
|
|
|
|
| 1451 |
.cursor { cursor:pointer; }
|
| 1452 |
"""
|
| 1453 |
|
| 1454 |
+
# ---------- App ----------
|
| 1455 |
+
with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Surveillance Radar", css=CSS, theme="soft") as demo:
|
| 1456 |
gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
|
| 1457 |
+
gr.Markdown(
|
| 1458 |
+
"This build includes per-bucket **k** heuristics, label **de-dup**, and a **surveillance-campaign detector** "
|
| 1459 |
+
"(template cadence + keywords + multi-sender/domain signals)."
|
| 1460 |
+
)
|
| 1461 |
|
| 1462 |
with gr.Row():
|
| 1463 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
|
|
|
| 1476 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 1477 |
lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
|
| 1478 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 1479 |
+
k_clusters = gr.Number(label="Base k (before per-bucket multiplier)", value=350, precision=0)
|
| 1480 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 1481 |
with gr.Row():
|
| 1482 |
use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
|
|
|
|
| 1506 |
sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
|
| 1507 |
lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
|
| 1508 |
sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
|
| 1509 |
+
tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "surveillance", "odd-hours", "personal-mail"], value="(any)")
|
| 1510 |
with gr.Row():
|
| 1511 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 1512 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 1513 |
sort_by = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
|
| 1514 |
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 1515 |
+
with gr.Row():
|
| 1516 |
+
hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
|
| 1517 |
+
|
| 1518 |
+
with gr.Accordion("Surveillance Radar", open=True):
|
| 1519 |
+
with gr.Row():
|
| 1520 |
+
watchlist_in = gr.Textbox(label="Watchlist (names or entities, comma-separated)", value="Hillary Clinton, Joe Biden, Donald Trump")
|
| 1521 |
+
min_mentions = gr.Number(label="Min mentions per entity", value=15, precision=0)
|
| 1522 |
|
| 1523 |
with gr.Row():
|
| 1524 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
| 1535 |
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
| 1536 |
offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
|
| 1537 |
|
| 1538 |
+
gr.Markdown("### Surveillance Campaigns (detected entities)")
|
| 1539 |
+
with gr.Row():
|
| 1540 |
+
surv_entities_df = gr.Dataframe(label="Entities ranked by surveillance score", interactive=False, wrap=True)
|
| 1541 |
+
surv_samples_df = gr.Dataframe(label="Sample emails for highlighted entities", interactive=False, wrap=True)
|
| 1542 |
+
|
| 1543 |
gr.Markdown("### Search")
|
| 1544 |
with gr.Row():
|
| 1545 |
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
|
|
|
| 1547 |
results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
|
| 1548 |
email_view = gr.HTML(label="Reader")
|
| 1549 |
|
| 1550 |
+
# -------- State --------
|
| 1551 |
state_df = gr.State()
|
| 1552 |
state_vec = gr.State()
|
| 1553 |
state_X_reduced = gr.State()
|
|
|
|
| 1608 |
if bucket and bucket != "(any)":
|
| 1609 |
out = out[out["bucket"] == bucket]
|
| 1610 |
if cluster and cluster != "(any)":
|
|
|
|
| 1611 |
m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
|
| 1612 |
if m:
|
| 1613 |
cid = int(m.group(1))
|
|
|
|
| 1639 |
out = out[out["cluster_id"] != -3]
|
| 1640 |
return out
|
| 1641 |
|
| 1642 |
+
# -------- Social graph summary --------
|
| 1643 |
def social_stats(df: pd.DataFrame) -> pd.DataFrame:
|
| 1644 |
deg = {}
|
| 1645 |
def add_edge(a,b):
|
|
|
|
| 1657 |
out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
|
| 1658 |
return out
|
| 1659 |
|
| 1660 |
+
# -------- Sorting helper --------
|
| 1661 |
+
def _sort_results(df, by, direction):
|
| 1662 |
+
if df is None or len(df) == 0:
|
| 1663 |
+
return pd.DataFrame()
|
| 1664 |
+
tmp = df.copy()
|
| 1665 |
+
if "date" in tmp.columns:
|
| 1666 |
+
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 1667 |
+
else:
|
| 1668 |
+
tmp["_dt"] = pd.NaT
|
| 1669 |
+
by = by if by in tmp.columns else "context_anomaly_score"
|
| 1670 |
+
asc = (direction == "asc")
|
| 1671 |
+
sort_cols = [by]
|
| 1672 |
+
if by == "date":
|
| 1673 |
+
sort_cols = ["_dt"]
|
| 1674 |
+
elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
|
| 1675 |
+
sort_cols.append("_dt")
|
| 1676 |
+
tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
|
| 1677 |
+
cols_out = [
|
| 1678 |
+
"date","bucket","from_email","from_domain","subject","cluster_name","lang",
|
| 1679 |
+
"tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
|
| 1680 |
+
]
|
| 1681 |
+
if "search_score" in tmp.columns:
|
| 1682 |
+
cols_out.append("search_score")
|
| 1683 |
+
return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
|
| 1684 |
+
|
| 1685 |
+
# -------- Vectorization helpers (mirror training path for queries) --------
|
| 1686 |
+
def _tokenize_query(q: str) -> List[str]:
|
| 1687 |
+
return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
|
| 1688 |
+
|
| 1689 |
+
def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
|
| 1690 |
+
try:
|
| 1691 |
+
return norm.transform(svd.transform(q_vec)).astype(np.float32)
|
| 1692 |
+
except Exception:
|
| 1693 |
+
return None
|
| 1694 |
+
|
| 1695 |
+
def _vectorize_query(q, vec_state, corpus_texts):
    """Vectorize a search query with the same feature pipeline used for the
    corpus documents, so query/document similarities are comparable.

    NOTE(review): this refits the word and char vectorizers on
    ``corpus_texts`` for every query — presumably acceptable at this corpus
    size; confirm for large inboxes.

    Parameters
    ----------
    q : str — raw query text.
    vec_state : dict — vectorizer settings captured at processing time
        (use_hashing, use_bigrams, hash_bits, max_features, min_df, max_df).
    corpus_texts : list[str] — the document texts the original vectorizers
        were fitted on.

    Returns
    -------
    Sparse CSR row: [word features | 0.20 * char 3-5-gram features].
    """
    # Build the same features for the query that we used for docs
    # Char n-grams need min_df=1 when there is at most one document.
    char_min_df = 1 if len(corpus_texts) <= 1 else 2

    if vec_state.get("use_hashing"):
        # Hashing path: stateless word hashing, TF-IDF weights refit on the corpus.
        hv = HashingVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            n_features=2 ** vec_state.get("hash_bits", 18),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            norm=None,
            alternate_sign=False,
        )
        counts = hv.transform(corpus_texts)
        tfidf_tr = TfidfTransformer().fit(counts)
        q_word = tfidf_tr.transform(hv.transform([q]))
    else:
        # Count path: vocabulary refit on the corpus, then BM25 weighting.
        cv = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
            max_features=vec_state.get("max_features"),
            min_df=vec_state.get("min_df"),
            max_df=vec_state.get("max_df"),
            token_pattern=TOKEN_PATTERN,
            lowercase=True,
            stop_words=STOPWORD_FOR_VEC,
            dtype=np.float32,
        )
        tf = cv.fit_transform(corpus_texts)
        bm25 = BM25Transformer().fit(tf)
        q_word = bm25.transform(cv.transform([q]))

    # Character 3-5-grams add robustness to typos/morphology; down-weighted 5x.
    char_vec = CharTfidf(
        analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
    ).fit(corpus_texts)
    q_char = char_vec.transform([q])

    return hstack([q_word, q_char * 0.20], format="csr")
|
| 1734 |
+
|
| 1735 |
# -------- Main pipeline --------
|
| 1736 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1737 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1738 |
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
|
|
|
| 1739 |
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1740 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
|
| 1741 |
+
watchlist_in, min_mentions):
|
| 1742 |
if inbox_file is None:
|
| 1743 |
+
return (
|
| 1744 |
+
"**Please upload a file.**",
|
| 1745 |
+
None, None, None, None, None,
|
| 1746 |
+
None, None, # surveillance outputs
|
| 1747 |
+
None, # results_df
|
| 1748 |
+
None, None, None, # states df/vec/X
|
| 1749 |
+
None, None, # index & term names
|
| 1750 |
+
None, None, # flags
|
| 1751 |
+
gr.update(), gr.update(), gr.update(), gr.update(), # dropdowns
|
| 1752 |
+
None, None, None, # svd/norm/dims
|
| 1753 |
+
None, None, # extra terms / highlight
|
| 1754 |
+
gr.update() # bucket list
|
| 1755 |
+
)
|
| 1756 |
|
| 1757 |
+
# === Inner helpers for this function ===
|
| 1758 |
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
| 1759 |
texts = list(df_in.apply(enrich_text, axis=1))
|
| 1760 |
subjects_only = list(df_in["subject"].fillna(""))
|
|
|
|
| 1853 |
|
| 1854 |
def _cluster_space(
|
| 1855 |
X_space,
|
| 1856 |
+
bucket_name: str,
|
| 1857 |
df_part: pd.DataFrame,
|
| 1858 |
use_lsa: bool,
|
| 1859 |
use_hdbscan: bool,
|
|
|
|
| 1869 |
):
|
| 1870 |
n = X_space.shape[0]
|
| 1871 |
|
| 1872 |
+
# Per-bucket stabilizer params
|
| 1873 |
+
min_size, merge_th, reassign_th = _bucket_stabilizer_params(bucket_name)
|
| 1874 |
+
|
| 1875 |
if n <= 1:
|
| 1876 |
labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
|
| 1877 |
centers = None
|
| 1878 |
chosen_k = int(n) if n > 0 else 0
|
| 1879 |
+
return stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th), centers, chosen_k
|
| 1880 |
+
|
| 1881 |
if n < 10:
|
| 1882 |
k_small = min(max(2, n // 2), n)
|
| 1883 |
kmeans = MiniBatchKMeans(
|
|
|
|
| 1888 |
)
|
| 1889 |
labels = kmeans.fit_predict(X_space)
|
| 1890 |
centers = getattr(kmeans, "cluster_centers_", None)
|
| 1891 |
+
labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
|
| 1892 |
return labels, centers, int(len(set(labels)))
|
| 1893 |
|
| 1894 |
if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
|
|
|
|
| 1902 |
)
|
| 1903 |
labels = clusterer.fit_predict(X_space)
|
| 1904 |
centers = None
|
| 1905 |
+
labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
|
| 1906 |
chosen_k = int(len(set([l for l in labels if l >= 0])))
|
| 1907 |
return labels, centers, chosen_k
|
| 1908 |
|
| 1909 |
+
# Choose k (global rule or kneedle), then per-bucket multiplier
|
| 1910 |
if bool(auto_k):
|
| 1911 |
if use_lsa and isinstance(X_space, np.ndarray):
|
| 1912 |
k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
|
|
|
|
| 1914 |
k = auto_k_rule(X_space.shape[0])
|
| 1915 |
else:
|
| 1916 |
k = max(10, int(k_clusters or 350))
|
| 1917 |
+
k = int(max(2, round(k * _bucket_k_multiplier(bucket_name))))
|
| 1918 |
|
| 1919 |
init = None
|
| 1920 |
if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
|
|
|
|
| 1936 |
centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
|
| 1937 |
if use_lsa and centers is not None:
|
| 1938 |
labels = merge_close_clusters(labels, centers, thresh=0.95)
|
| 1939 |
+
labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
|
| 1940 |
chosen_k = int(len(set(labels)))
|
| 1941 |
return labels, centers, chosen_k
|
| 1942 |
|
| 1943 |
+
# ---- Begin processing ----
|
| 1944 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 1945 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1946 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
|
|
|
| 1948 |
recs = _load_json_records(inbox_file.name)
|
| 1949 |
if not recs:
|
| 1950 |
return ("**No valid records found.**",
|
| 1951 |
+
None, None, None, None, None,
|
| 1952 |
+
None, None,
|
| 1953 |
+
None,
|
| 1954 |
+
None, None, None,
|
| 1955 |
+
None, None,
|
| 1956 |
+
None, None, None, None,
|
| 1957 |
+
None, None, None,
|
| 1958 |
+
None, None,
|
| 1959 |
+
None)
|
| 1960 |
|
| 1961 |
normd = []
|
| 1962 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
|
|
|
| 1966 |
df = pd.DataFrame(normd)
|
| 1967 |
if df.empty:
|
| 1968 |
return ("**No usable email records.**",
|
| 1969 |
+
None, None, None, None, None,
|
| 1970 |
+
None, None,
|
| 1971 |
+
None,
|
| 1972 |
+
None, None, None,
|
| 1973 |
+
None, None,
|
| 1974 |
+
None, None, None, None,
|
| 1975 |
+
None, None, None,
|
| 1976 |
+
None, None,
|
| 1977 |
+
None)
|
| 1978 |
|
| 1979 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 1980 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 1981 |
df = compute_sentiment_column(df)
|
| 1982 |
|
| 1983 |
+
# Stage-1 routing (bucketing)
|
| 1984 |
df["bucket"] = df.apply(route_email_row, axis=1)
|
| 1985 |
df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
|
| 1986 |
df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
|
| 1987 |
df.loc[df["is_news"] == True, "bucket"] = "Newsletters/Alerts"
|
| 1988 |
df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"
|
| 1989 |
|
| 1990 |
+
# Flags
|
| 1991 |
flags = []
|
| 1992 |
for _, row in df.iterrows():
|
| 1993 |
f = []
|
|
|
|
| 2001 |
flags.append(f)
|
| 2002 |
df["flags"] = flags
|
| 2003 |
|
| 2004 |
+
# Split out stable buckets
|
| 2005 |
df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
|
| 2006 |
df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
|
| 2007 |
df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)
|
| 2008 |
|
| 2009 |
+
# Optional embeddings
|
| 2010 |
kv = None
|
| 2011 |
emb_dim = 0
|
| 2012 |
if bool(use_embeddings):
|
| 2013 |
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 2014 |
|
| 2015 |
+
# Build partitions: per language within bucket if requested
|
| 2016 |
parts = []
|
| 2017 |
if bool(per_language):
|
| 2018 |
for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
|
|
|
|
| 2023 |
parts.append(((bkt, "all"), grp.copy()))
|
| 2024 |
|
| 2025 |
labels_list, cluster_name_list, anomaly_list = [], [], []
|
| 2026 |
+
bucket_indexers = []
|
| 2027 |
X_reduced_holder = None
|
| 2028 |
term_names_global = {}
|
| 2029 |
single_partition = (len(parts) == 1)
|
| 2030 |
+
d_word_agg, d_char_agg = 0, 0
|
| 2031 |
svd_obj_local, norm_obj_local = None, None
|
| 2032 |
|
| 2033 |
for (bucket_name, _lang), df_part in parts:
|
|
|
|
| 2063 |
except Exception:
|
| 2064 |
pass
|
| 2065 |
|
| 2066 |
+
labels, _, _ = _cluster_space(
|
| 2067 |
X_space=X_space,
|
| 2068 |
+
bucket_name=bucket_name,
|
| 2069 |
df_part=df_part,
|
| 2070 |
use_lsa=bool(use_lsa) and X_reduced is not None,
|
| 2071 |
use_hdbscan=bool(use_hdbscan),
|
|
|
|
| 2080 |
d_word=d_word,
|
| 2081 |
d_char=d_char,
|
| 2082 |
)
|
|
|
|
|
|
|
| 2083 |
|
| 2084 |
term_names = cluster_labels_pmi_bigram(
|
| 2085 |
texts=texts, labels=labels, subjects=subjects_only,
|
| 2086 |
topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
|
| 2087 |
)
|
| 2088 |
|
|
|
|
| 2089 |
bucket_indexers.append(df_part.index)
|
| 2090 |
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 2091 |
cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
|
| 2092 |
anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
|
|
|
|
| 2093 |
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 2094 |
|
| 2095 |
if single_partition and X_reduced is not None:
|
| 2096 |
X_reduced_holder = X_reduced
|
| 2097 |
+
|
| 2098 |
if labels_list:
|
| 2099 |
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"] = pd.concat(labels_list).sort_index()
|
| 2100 |
df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
|
|
|
|
| 2104 |
df_main["cluster_name"] = "unclustered"
|
| 2105 |
df_main["anomaly_score"] = np.nan
|
| 2106 |
|
| 2107 |
+
# Assign fixed ids for news/alerts buckets
|
| 2108 |
if len(df_news):
|
| 2109 |
df_news.loc[:, "cluster_id"] = -1
|
| 2110 |
df_news.loc[:, "cluster_name"] = "newsletter/news"
|
|
|
|
| 2114 |
df_alerts.loc[:, "cluster_name"] = "system/alerts"
|
| 2115 |
df_alerts.loc[:, "anomaly_score"] = np.nan
|
| 2116 |
|
| 2117 |
+
# Merge back
|
| 2118 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 2119 |
+
|
| 2120 |
+
# Label de-dup pass per-bucket
|
| 2121 |
+
df = dedupe_all_labels(df)
|
| 2122 |
+
|
| 2123 |
+
# Scores
|
| 2124 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 2125 |
df = compute_context_anomaly(df)
|
| 2126 |
|
| 2127 |
+
# Surveillance campaigns
|
| 2128 |
+
wl = [w.strip() for w in (watchlist_in or "").split(",") if w.strip()]
|
| 2129 |
+
ent_df, samp_df = detect_surveillance_campaigns(df, watchlist=wl, min_mentions=int(min_mentions or 15))
|
| 2130 |
+
df = tag_surveillance_emails(df, ent_df, threshold=4.5)
|
| 2131 |
+
|
| 2132 |
+
# Build indexes/search
|
| 2133 |
index_obj = None
|
| 2134 |
use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
|
| 2135 |
if use_faiss_flag:
|
|
|
|
| 2146 |
except Exception:
|
| 2147 |
pass
|
| 2148 |
|
| 2149 |
+
# Summaries
|
| 2150 |
cluster_counts = (
|
| 2151 |
df.groupby(["bucket", "cluster_id", "cluster_name"])
|
| 2152 |
.size()
|
|
|
|
| 2193 |
norm_obj_out = norm_obj_local if single_partition else None
|
| 2194 |
|
| 2195 |
return (
|
| 2196 |
+
status_md, # status
|
| 2197 |
+
cluster_counts, domain_counts, sender_counts, # summaries
|
| 2198 |
+
actors, offhours_table, # extra summaries
|
| 2199 |
+
ent_df, samp_df, # surveillance tables
|
| 2200 |
+
out_table, # results table
|
| 2201 |
+
df, vec_state, X_reduced_holder, # states
|
| 2202 |
+
index_obj, term_names_global, # index + labels
|
| 2203 |
+
bool(use_lsa), use_faiss_flag, # flags
|
| 2204 |
gr.update(choices=cluster_choices, value="(any)"),
|
| 2205 |
gr.update(choices=domain_choices, value="(any)"),
|
| 2206 |
gr.update(choices=sender_choices, value="(any)"),
|
|
|
|
| 2210 |
gr.update(choices=bucket_choices, value="(any)")
|
| 2211 |
)
|
| 2212 |
|
| 2213 |
+
# Bind Process button
|
| 2214 |
(run_btn.click)(
|
| 2215 |
process_file,
|
| 2216 |
inputs=[
|
|
|
|
| 2219 |
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 2220 |
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 2221 |
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
|
| 2222 |
+
watchlist_in, min_mentions
|
| 2223 |
],
|
| 2224 |
outputs=[
|
| 2225 |
status, cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 2226 |
+
actors_df, offhours_df,
|
| 2227 |
+
surv_entities_df, surv_samples_df,
|
| 2228 |
+
results_df,
|
| 2229 |
+
state_df, state_vec, state_X_reduced,
|
| 2230 |
+
state_index, state_term_names,
|
| 2231 |
state_use_lsa, state_use_faiss,
|
| 2232 |
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 2233 |
+
state_svd, state_norm, state_dims,
|
| 2234 |
+
state_extra_terms, state_highlight,
|
| 2235 |
bucket_drop,
|
| 2236 |
],
|
| 2237 |
)
|
| 2238 |
|
| 2239 |
# -------- Filtering & Search --------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2240 |
def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
|
| 2241 |
if df is None or len(df) == 0:
|
| 2242 |
return pd.DataFrame()
|
|
|
|
| 2245 |
)
|
| 2246 |
return _sort_results(filt, sort_by, sort_dir)
|
| 2247 |
|
| 2248 |
+
# Re-run when any filter control changes
|
| 2249 |
for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 2250 |
date_start, date_end, sort_by, sort_dir, hide_noise]:
|
| 2251 |
ctrl.change(
|
|
|
|
| 2272 |
outputs=[results_df],
|
| 2273 |
)
|
| 2274 |
|
| 2275 |
+
# --- Search ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2276 |
def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
|
| 2277 |
if not q or df is None or vec is None or index is None:
|
| 2278 |
return pd.DataFrame(), []
|
|
|
|
| 2295 |
return pd.DataFrame(), q_terms
|
| 2296 |
|
| 2297 |
if isinstance(index, NearestNeighbors):
|
|
|
|
| 2298 |
if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
|
| 2299 |
return pd.DataFrame(), q_terms
|
| 2300 |
dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
|
|
|
|
| 2319 |
outputs=[results_df, state_query_terms],
|
| 2320 |
)
|
| 2321 |
|
| 2322 |
+
# --- Reader selection (highlighting) ---
|
| 2323 |
def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
|
| 2324 |
if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
|
| 2325 |
return ""
|
|
|
|
| 2354 |
outputs=[email_view],
|
| 2355 |
)
|
| 2356 |
|
| 2357 |
+
# --- Click-to-filter helpers ---
|
| 2358 |
def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
|
| 2359 |
if evt.index is None or df_sum is None or df_sum.empty:
|
| 2360 |
return gr.update()
|
|
|
|
| 2367 |
r = df_sum.iloc[evt.index[0]]
|
| 2368 |
return gr.update(value=r["bucket"]), gr.update(value=r["label"])
|
| 2369 |
|
|
|
|
| 2370 |
cluster_counts_df.select(
|
| 2371 |
on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
|
| 2372 |
).then(
|
|
|
|
| 2378 |
outputs=[results_df],
|
| 2379 |
)
|
| 2380 |
|
|
|
|
| 2381 |
domain_counts_df.select(
|
| 2382 |
lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
|
| 2383 |
).then(
|
|
|
|
| 2389 |
outputs=[results_df],
|
| 2390 |
)
|
| 2391 |
|
|
|
|
| 2392 |
sender_counts_df.select(
|
| 2393 |
lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
|
| 2394 |
).then(
|
|
|
|
| 2402 |
|
| 2403 |
if __name__ == "__main__":
    # Script entry point: start the Gradio UI defined above.
    # Disable SSR to avoid handler arity warnings under server-side rendering
    demo.launch(ssr_mode=False)
|
| 2406 |
+
|