Update app.py
Browse files
app.py
CHANGED
|
@@ -227,17 +227,28 @@ YEAR_RE = re.compile(r"^(19|20)\d{2}$")
|
|
| 227 |
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
|
| 228 |
ONE_CHAR_RE = re.compile(r"^.$")
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
# This stoplist is used by the CountVectorizer (MUST be list for sklearn)
|
| 231 |
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 232 |
|
| 233 |
def _is_junk_term(t: str) -> bool:
|
| 234 |
-
tl = t.lower()
|
| 235 |
-
if
|
| 236 |
-
|
| 237 |
if EMAIL_LIKE_RE.search(tl): return True
|
| 238 |
if YEAR_RE.match(tl): return True
|
| 239 |
if NUMERIC_RE.match(tl): return True
|
| 240 |
if ONE_CHAR_RE.match(tl): return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
return False
|
| 242 |
|
| 243 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
@@ -686,13 +697,22 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 686 |
tokens.append(lang_tok)
|
| 687 |
return (t + " " + " ".join(tokens)).strip()
|
| 688 |
|
| 689 |
-
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
|
| 690 |
-
def cluster_labels_pmi_bigram(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
"""
|
| 692 |
Improved labeler:
|
| 693 |
- Considers bigrams AND trigrams (PMI vs. global)
|
| 694 |
- Class-TFIDF unigrams with subject coverage boost
|
| 695 |
- Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
|
|
|
|
| 696 |
"""
|
| 697 |
import math as _math
|
| 698 |
from collections import Counter, defaultdict
|
|
@@ -704,12 +724,13 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
|
|
| 704 |
def is_junk_token(tok: str) -> bool:
|
| 705 |
if _is_junk_term(tok): return True
|
| 706 |
tl = tok.lower()
|
| 707 |
-
if tl.startswith("__"): return True
|
| 708 |
-
if tl in STOP_TERMS: return True
|
| 709 |
-
if tl in HEADER_STOP: return True
|
| 710 |
if "@" in tl: return True
|
| 711 |
if tl.isascii() and len(tl) <= 2: return True
|
| 712 |
-
if
|
|
|
|
|
|
|
|
|
|
| 713 |
return False
|
| 714 |
|
| 715 |
def tokenize_clean(t):
|
|
@@ -769,7 +790,7 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
|
|
| 769 |
for c in sorted(set(int(x) for x in labels)):
|
| 770 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 771 |
|
| 772 |
-
# PMI bigrams & trigrams with subject-coverage boost
|
| 773 |
phrases = []
|
| 774 |
for store, glob_df, subj_docs, n in (
|
| 775 |
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
|
@@ -788,14 +809,17 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
|
|
| 788 |
cov = 0.0
|
| 789 |
if have_subjects:
|
| 790 |
cov = subj_docs[ng] / n_docs_c
|
|
|
|
|
|
|
|
|
|
| 791 |
score += subject_alpha * cov
|
| 792 |
-
scored.append((score, ng))
|
| 793 |
-
|
| 794 |
-
|
| 795 |
take = max(1, topn // (3 if n == 3 else 2))
|
| 796 |
-
phrases.extend([p for _, p in scored[:take]])
|
| 797 |
|
| 798 |
-
# Class-TFIDF unigrams with subject coverage boost
|
| 799 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 800 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 801 |
corpus = [docs_c[0], docs_bg[0]]
|
|
@@ -807,16 +831,24 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
|
|
| 807 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 808 |
row = X[0].toarray().ravel()
|
| 809 |
|
| 810 |
-
# Subject coverage vector
|
| 811 |
subj_cov = np.zeros_like(row)
|
|
|
|
|
|
|
| 812 |
if have_subjects:
|
| 813 |
-
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 814 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 815 |
if tok in vocab_index and not is_junk_token(tok):
|
| 816 |
-
|
|
|
|
|
|
|
|
|
|
| 817 |
|
|
|
|
| 818 |
row_boosted = row + subject_alpha * subj_cov
|
| 819 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
unis = []
|
| 821 |
for i in order:
|
| 822 |
tok = vocab[i]
|
|
@@ -924,13 +956,95 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
|
|
| 924 |
return seeds_red
|
| 925 |
return None
|
| 926 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 927 |
# =================== Scoring & Flags ===================
|
| 928 |
def _hour_of(dt_iso: str) -> Optional[int]:
|
| 929 |
try:
|
| 930 |
if not dt_iso: return None
|
| 931 |
dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
|
| 932 |
if pd.isna(dt): return None
|
| 933 |
-
# treat UTC for lack of per-user tz; still useful as "odd hour"
|
| 934 |
return int(dt.hour)
|
| 935 |
except Exception:
|
| 936 |
return None
|
|
@@ -960,11 +1074,9 @@ def corruption_score(row, trusted_domains: set):
|
|
| 960 |
body_len = len(row.get("body_text",""))
|
| 961 |
if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
|
| 962 |
score += 0.5
|
| 963 |
-
# personal/off-channel via headers
|
| 964 |
fd = (row.get("from_domain") or "").lower()
|
| 965 |
if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
|
| 966 |
score += 0.5
|
| 967 |
-
# odd hours
|
| 968 |
h = _hour_of(row.get("date") or "")
|
| 969 |
if h is not None and (h < 6 or h > 22):
|
| 970 |
score += 0.3
|
|
@@ -1044,6 +1156,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1044 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 1045 |
sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
|
| 1046 |
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
|
|
|
|
|
|
| 1047 |
|
| 1048 |
with gr.Row():
|
| 1049 |
run_btn = gr.Button("Process", variant="primary")
|
|
@@ -1121,6 +1235,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1121 |
tag_value: str,
|
| 1122 |
start: str,
|
| 1123 |
end: str,
|
|
|
|
| 1124 |
) -> pd.DataFrame:
|
| 1125 |
out = df
|
| 1126 |
if cluster and cluster != "(any)":
|
|
@@ -1151,11 +1266,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1151 |
out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
|
| 1152 |
except Exception:
|
| 1153 |
pass
|
|
|
|
|
|
|
| 1154 |
return out
|
| 1155 |
|
| 1156 |
# -------- Simple social network stats --------
|
| 1157 |
def social_stats(df: pd.DataFrame) -> pd.DataFrame:
|
| 1158 |
-
# degree = unique counterparts per address (from <-> each to/cc)
|
| 1159 |
deg = {}
|
| 1160 |
def add_edge(a,b):
|
| 1161 |
if not a or not b or a==b: return
|
|
@@ -1186,7 +1302,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1186 |
|
| 1187 |
# === Vectorization & Clustering (UPGRADED) ===
|
| 1188 |
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
| 1189 |
-
# “texts” feed vectorizers; “subjects_only” helps labels
|
| 1190 |
texts = list(df_in.apply(enrich_text, axis=1))
|
| 1191 |
subjects_only = list(df_in["subject"].fillna(""))
|
| 1192 |
return texts, subjects_only
|
|
@@ -1228,7 +1343,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1228 |
count_vec = None; bm25 = None
|
| 1229 |
return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
|
| 1230 |
|
| 1231 |
-
# Original Count -> BM25 + char
|
| 1232 |
count_vec = CountVectorizer(
|
| 1233 |
analyzer="word", ngram_range=ngram_range,
|
| 1234 |
max_features=int(max_features) if max_features else None,
|
|
@@ -1258,12 +1372,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1258 |
|
| 1259 |
n_docs = X_full.shape[0]
|
| 1260 |
n_feats = X_full.shape[1]
|
| 1261 |
-
# valid bound for TruncatedSVD: 1 <= n_components < min(n_docs, n_feats)
|
| 1262 |
max_components = max(1, min(n_docs, n_feats) - 1)
|
| 1263 |
n_comp = int(min(int(lsa_dim or 256), max_components))
|
| 1264 |
|
| 1265 |
if n_comp < 2:
|
| 1266 |
-
# too small to reduce meaningfully — skip LSA
|
| 1267 |
return None, None, None
|
| 1268 |
|
| 1269 |
svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
|
|
@@ -1274,9 +1386,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1274 |
return X_reduced, svd_obj, norm_obj
|
| 1275 |
|
| 1276 |
def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
|
| 1277 |
-
"""
|
| 1278 |
-
Concatenate averaged word vectors (dense) into the current space.
|
| 1279 |
-
"""
|
| 1280 |
if kv is None or emb_dim <= 0 or weight <= 0.0:
|
| 1281 |
return X_reduced_or_full, emb_dim
|
| 1282 |
doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
|
|
@@ -1303,19 +1412,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1303 |
norm_obj,
|
| 1304 |
d_word, d_char
|
| 1305 |
):
|
| 1306 |
-
"""
|
| 1307 |
-
Run HDBSCAN (if requested and available) or MiniBatchKMeans.
|
| 1308 |
-
"""
|
| 1309 |
n = X_space.shape[0]
|
| 1310 |
|
| 1311 |
-
# NEW: trivial/tiny partition handling
|
| 1312 |
if n <= 1:
|
| 1313 |
labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
|
| 1314 |
centers = None
|
| 1315 |
chosen_k = int(n) if n > 0 else 0
|
| 1316 |
return labels, centers, chosen_k
|
| 1317 |
if n < 10:
|
| 1318 |
-
# avoid unstable large-k on tiny sets
|
| 1319 |
k_small = min(max(2, n // 2), n)
|
| 1320 |
kmeans = MiniBatchKMeans(
|
| 1321 |
n_clusters=int(k_small),
|
|
@@ -1341,7 +1445,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1341 |
chosen_k = int(len(set([l for l in labels if l >= 0])))
|
| 1342 |
return labels, centers, chosen_k
|
| 1343 |
|
| 1344 |
-
# Otherwise MiniBatchKMeans
|
| 1345 |
if bool(auto_k):
|
| 1346 |
if use_lsa and isinstance(X_space, np.ndarray):
|
| 1347 |
k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
|
|
@@ -1373,7 +1476,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1373 |
chosen_k = int(len(set(labels)))
|
| 1374 |
return labels, centers, chosen_k
|
| 1375 |
|
| 1376 |
-
# Main logic starts
|
| 1377 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 1378 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1379 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
|
@@ -1424,7 +1526,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1424 |
if bool(use_embeddings):
|
| 1425 |
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1426 |
|
| 1427 |
-
# >>> FIX: keep original indices when splitting (no reset_index here)
|
| 1428 |
parts = []
|
| 1429 |
if bool(per_language) and "lang" in df_main.columns:
|
| 1430 |
for lang_code, grp in df_main.groupby("lang", dropna=False):
|
|
@@ -1488,18 +1589,26 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1488 |
d_word=d_word,
|
| 1489 |
d_char=d_char,
|
| 1490 |
)
|
| 1491 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1492 |
|
| 1493 |
term_names = cluster_labels_pmi_bigram(
|
| 1494 |
-
texts=texts, labels=labels, subjects=subjects_only,
|
|
|
|
| 1495 |
)
|
| 1496 |
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1497 |
|
| 1498 |
-
# >>> We kept original indices in df_part, so Series align to df_main on concat
|
| 1499 |
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1500 |
cluster_name_list.append(
|
| 1501 |
pd.Series(
|
| 1502 |
-
[term_names.get(int(c),
|
| 1503 |
index=df_part.index,
|
| 1504 |
)
|
| 1505 |
)
|
|
@@ -1517,7 +1626,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1517 |
df_main["cluster_name"] = "unclustered"
|
| 1518 |
df_main["anomaly_score"] = np.nan
|
| 1519 |
|
| 1520 |
-
# >>> Use .loc to avoid chained-assignment warnings
|
| 1521 |
if len(df_news):
|
| 1522 |
df_news.loc[:, "cluster_id"] = -1
|
| 1523 |
df_news.loc[:, "cluster_name"] = "newsletter/news"
|
|
@@ -1546,7 +1654,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1546 |
except Exception:
|
| 1547 |
pass
|
| 1548 |
|
| 1549 |
-
# Summaries and UI updates...
|
| 1550 |
cluster_counts = (
|
| 1551 |
df.groupby(["cluster_id", "cluster_name"])
|
| 1552 |
.size()
|
|
@@ -1585,7 +1692,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1585 |
"max_df": float(max_df),
|
| 1586 |
"use_bigrams": bool(use_bigrams),
|
| 1587 |
}
|
| 1588 |
-
status_md = f"**Processed {len(df):,} emails** |
|
| 1589 |
|
| 1590 |
svd_obj_out = svd_obj_local if single_partition else None
|
| 1591 |
norm_obj_out = norm_obj_local if single_partition else None
|
|
@@ -1710,13 +1817,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1710 |
|
| 1711 |
return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
|
| 1712 |
|
| 1713 |
-
def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir):
|
| 1714 |
if df is None or len(df) == 0:
|
| 1715 |
return pd.DataFrame()
|
| 1716 |
-
filt = _apply_filters(
|
|
|
|
|
|
|
| 1717 |
return _sort_results(filt, sort_by, sort_dir)
|
| 1718 |
|
| 1719 |
-
|
|
|
|
|
|
|
| 1720 |
ctrl.change(
|
| 1721 |
refresh_results,
|
| 1722 |
inputs=[
|
|
@@ -1731,14 +1842,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1731 |
date_end,
|
| 1732 |
sort_by,
|
| 1733 |
sort_dir,
|
|
|
|
| 1734 |
],
|
| 1735 |
outputs=[results_df],
|
| 1736 |
)
|
| 1737 |
|
|
|
|
| 1738 |
reset_btn.click(
|
| 1739 |
-
lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"],
|
| 1740 |
[],
|
| 1741 |
-
[cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
|
|
|
| 1742 |
).then(
|
| 1743 |
refresh_results,
|
| 1744 |
inputs=[
|
|
@@ -1753,21 +1867,25 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1753 |
date_end,
|
| 1754 |
sort_by,
|
| 1755 |
sort_dir,
|
|
|
|
| 1756 |
],
|
| 1757 |
outputs=[results_df],
|
| 1758 |
)
|
| 1759 |
|
|
|
|
| 1760 |
def _tokenize_query(q: str) -> List[str]:
|
| 1761 |
-
return [p.strip() for p in re.split(r"\s+", q) if p.strip()][:8]
|
| 1762 |
|
| 1763 |
def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
|
| 1764 |
try:
|
| 1765 |
return norm.transform(svd.transform(q_vec)).astype(np.float32)
|
| 1766 |
-
except:
|
| 1767 |
return None
|
| 1768 |
|
| 1769 |
def _vectorize_query(q, vec_state, corpus_texts):
|
|
|
|
| 1770 |
char_min_df = 1 if len(corpus_texts) <= 1 else 2
|
|
|
|
| 1771 |
if vec_state.get("use_hashing"):
|
| 1772 |
hv = HashingVectorizer(
|
| 1773 |
analyzer="word",
|
|
@@ -1775,7 +1893,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1775 |
n_features=2 ** vec_state.get("hash_bits", 18),
|
| 1776 |
token_pattern=TOKEN_PATTERN,
|
| 1777 |
lowercase=True,
|
|
|
|
|
|
|
| 1778 |
)
|
|
|
|
| 1779 |
counts = hv.transform(corpus_texts)
|
| 1780 |
tfidf_tr = TfidfTransformer().fit(counts)
|
| 1781 |
q_word = tfidf_tr.transform(hv.transform([q]))
|
|
@@ -1789,21 +1910,24 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1789 |
token_pattern=TOKEN_PATTERN,
|
| 1790 |
lowercase=True,
|
| 1791 |
stop_words=STOPWORD_FOR_VEC,
|
|
|
|
| 1792 |
)
|
| 1793 |
tf = cv.fit_transform(corpus_texts)
|
| 1794 |
bm25 = BM25Transformer().fit(tf)
|
| 1795 |
q_word = bm25.transform(cv.transform([q]))
|
| 1796 |
|
| 1797 |
char_vec = CharTfidf(
|
| 1798 |
-
analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True
|
| 1799 |
).fit(corpus_texts)
|
| 1800 |
q_char = char_vec.transform([q])
|
|
|
|
| 1801 |
return hstack([q_word, q_char * 0.20], format="csr")
|
| 1802 |
|
| 1803 |
def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
|
| 1804 |
if not q or df is None or vec is None or index is None:
|
| 1805 |
return pd.DataFrame(), []
|
| 1806 |
|
|
|
|
| 1807 |
mask = ~df["cluster_id"].isin([-1, -2, -3])
|
| 1808 |
df_main = df[mask].reset_index(drop=True)
|
| 1809 |
if df_main.empty:
|
|
@@ -1812,7 +1936,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1812 |
q_terms = _tokenize_query(q)
|
| 1813 |
q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))
|
| 1814 |
|
| 1815 |
-
q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd and norm else q_vec
|
| 1816 |
if q_emb is None:
|
| 1817 |
return pd.DataFrame(), q_terms
|
| 1818 |
|
|
@@ -1821,13 +1945,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1821 |
return pd.DataFrame(), q_terms
|
| 1822 |
|
| 1823 |
if isinstance(index, NearestNeighbors):
|
|
|
|
| 1824 |
if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
|
| 1825 |
return pd.DataFrame(), q_terms
|
| 1826 |
dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
|
| 1827 |
sims = 1.0 - dists[0]
|
| 1828 |
results = df_main.iloc[inds[0]].copy()
|
| 1829 |
results["search_score"] = sims
|
| 1830 |
-
elif use_faiss and FAISS_OK and
|
| 1831 |
D, I = index.search(q_emb.astype(np.float32), k=n_req)
|
| 1832 |
results = df_main.iloc[I[0]].copy()
|
| 1833 |
results["search_score"] = D[0]
|
|
@@ -1854,12 +1979,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1854 |
outputs=[results_df, state_query_terms],
|
| 1855 |
)
|
| 1856 |
|
|
|
|
| 1857 |
def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
|
| 1858 |
if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
|
| 1859 |
return ""
|
| 1860 |
row_idx = evt.index[0]
|
| 1861 |
sel = table.iloc[row_idx]
|
| 1862 |
|
|
|
|
| 1863 |
cand = df[
|
| 1864 |
(df["subject"] == sel.get("subject"))
|
| 1865 |
& (df["from_email"] == sel.get("from_email"))
|
|
@@ -1874,7 +2001,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1874 |
cid = int(row.get("cluster_id", -99))
|
| 1875 |
clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
|
| 1876 |
return build_highlighted_html(
|
| 1877 |
-
row,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1878 |
)
|
| 1879 |
|
| 1880 |
results_df.select(
|
|
@@ -1883,12 +2014,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1883 |
outputs=[email_view],
|
| 1884 |
)
|
| 1885 |
|
|
|
|
| 1886 |
def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
|
| 1887 |
if evt.index is None or df_sum is None or df_sum.empty:
|
| 1888 |
return gr.update()
|
| 1889 |
val = df_sum.iloc[evt.index[0]][col_name]
|
| 1890 |
return gr.update(value=val)
|
| 1891 |
|
|
|
|
| 1892 |
cluster_counts_df.select(
|
| 1893 |
lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
|
| 1894 |
).then(
|
|
@@ -1905,9 +2038,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1905 |
date_end,
|
| 1906 |
sort_by,
|
| 1907 |
sort_dir,
|
|
|
|
| 1908 |
],
|
| 1909 |
outputs=[results_df],
|
| 1910 |
)
|
|
|
|
|
|
|
| 1911 |
domain_counts_df.select(
|
| 1912 |
lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
|
| 1913 |
).then(
|
|
@@ -1924,9 +2060,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1924 |
date_end,
|
| 1925 |
sort_by,
|
| 1926 |
sort_dir,
|
|
|
|
| 1927 |
],
|
| 1928 |
outputs=[results_df],
|
| 1929 |
)
|
|
|
|
|
|
|
| 1930 |
sender_counts_df.select(
|
| 1931 |
lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
|
| 1932 |
).then(
|
|
@@ -1943,11 +2082,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1943 |
date_end,
|
| 1944 |
sort_by,
|
| 1945 |
sort_dir,
|
|
|
|
| 1946 |
],
|
| 1947 |
outputs=[results_df],
|
| 1948 |
)
|
| 1949 |
|
| 1950 |
-
|
| 1951 |
if __name__ == "__main__":
|
| 1952 |
# Disable SSR to avoid handler arity warnings under server-side rendering
|
| 1953 |
-
demo.launch(ssr_mode=False)
|
|
|
|
| 227 |
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
|
| 228 |
ONE_CHAR_RE = re.compile(r"^.$")
|
| 229 |
|
| 230 |
+
# ---- NEW junk token guards (base64/hex/trackers/very-long) ----
|
| 231 |
+
LONG_ALNUM_RE = re.compile(r"^[A-Za-z0-9_-]{24,}$") # long tracking/b64-ish
|
| 232 |
+
HEXISH_RE = re.compile(r"^(?:[A-Fa-f0-9]{8,})$") # long hex blobs
|
| 233 |
+
DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$") # too many digits
|
| 234 |
+
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
|
| 235 |
+
|
| 236 |
# This stoplist is used by the CountVectorizer (MUST be list for sklearn)
|
| 237 |
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 238 |
|
| 239 |
def _is_junk_term(t: str) -> bool:
|
| 240 |
+
tl = (t or "").strip().lower()
|
| 241 |
+
if not tl: return True
|
| 242 |
+
if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS: return True
|
| 243 |
if EMAIL_LIKE_RE.search(tl): return True
|
| 244 |
if YEAR_RE.match(tl): return True
|
| 245 |
if NUMERIC_RE.match(tl): return True
|
| 246 |
if ONE_CHAR_RE.match(tl): return True
|
| 247 |
+
if LONG_ALNUM_RE.match(t): return True
|
| 248 |
+
if HEXISH_RE.match(t): return True
|
| 249 |
+
if DIGIT_HEAVY_RE.match(t): return True
|
| 250 |
+
if UNDERSCORE_HEAVY_RE.match(t): return True
|
| 251 |
+
if len(t) > 40: return True
|
| 252 |
return False
|
| 253 |
|
| 254 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
|
|
| 697 |
tokens.append(lang_tok)
|
| 698 |
return (t + " " + " ".join(tokens)).strip()
|
| 699 |
|
| 700 |
+
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST (+ coverage ≥30% preference) ===================
|
| 701 |
+
def cluster_labels_pmi_bigram(
|
| 702 |
+
texts,
|
| 703 |
+
labels,
|
| 704 |
+
subjects=None,
|
| 705 |
+
topn=6,
|
| 706 |
+
subject_alpha=0.75,
|
| 707 |
+
global_ubiq_cut=0.20,
|
| 708 |
+
subject_min_cov=0.30 # NEW: prefer subject terms that appear in ≥30% of a cluster's subjects
|
| 709 |
+
):
|
| 710 |
"""
|
| 711 |
Improved labeler:
|
| 712 |
- Considers bigrams AND trigrams (PMI vs. global)
|
| 713 |
- Class-TFIDF unigrams with subject coverage boost
|
| 714 |
- Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
|
| 715 |
+
- NEW: prefers subject terms that occur in ≥30% of cluster subjects (can be tuned via subject_min_cov)
|
| 716 |
"""
|
| 717 |
import math as _math
|
| 718 |
from collections import Counter, defaultdict
|
|
|
|
| 724 |
def is_junk_token(tok: str) -> bool:
|
| 725 |
if _is_junk_term(tok): return True
|
| 726 |
tl = tok.lower()
|
| 727 |
+
if tl.startswith("__"): return True # our feature flags
|
|
|
|
|
|
|
| 728 |
if "@" in tl: return True
|
| 729 |
if tl.isascii() and len(tl) <= 2: return True
|
| 730 |
+
if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
|
| 731 |
+
if len(tok) > 40: return True
|
| 732 |
+
# strip punctuation-heavy artifacts (URLs already replaced with 'URL')
|
| 733 |
+
if re.search(r"[^\w\-’']", tl): return True
|
| 734 |
return False
|
| 735 |
|
| 736 |
def tokenize_clean(t):
|
|
|
|
| 790 |
for c in sorted(set(int(x) for x in labels)):
|
| 791 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 792 |
|
| 793 |
+
# ===== PMI bigrams & trigrams with subject-coverage boost & ≥30% preference =====
|
| 794 |
phrases = []
|
| 795 |
for store, glob_df, subj_docs, n in (
|
| 796 |
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
|
|
|
| 809 |
cov = 0.0
|
| 810 |
if have_subjects:
|
| 811 |
cov = subj_docs[ng] / n_docs_c
|
| 812 |
+
# add a preference bump if subject coverage ≥ threshold
|
| 813 |
+
if cov >= subject_min_cov:
|
| 814 |
+
score += 0.6 # modest, ensures such terms bubble up
|
| 815 |
score += subject_alpha * cov
|
| 816 |
+
scored.append((score, cov, ng))
|
| 817 |
+
# prefer subject-coverage ≥ threshold first, then highest score
|
| 818 |
+
scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
|
| 819 |
take = max(1, topn // (3 if n == 3 else 2))
|
| 820 |
+
phrases.extend([p for _, _, p in scored[:take]])
|
| 821 |
|
| 822 |
+
# ===== Class-TFIDF unigrams with subject coverage boost & ≥30% preference =====
|
| 823 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 824 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 825 |
corpus = [docs_c[0], docs_bg[0]]
|
|
|
|
| 831 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 832 |
row = X[0].toarray().ravel()
|
| 833 |
|
|
|
|
| 834 |
subj_cov = np.zeros_like(row)
|
| 835 |
+
subj_cov_frac = np.zeros_like(row)
|
| 836 |
+
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 837 |
if have_subjects:
|
|
|
|
| 838 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 839 |
if tok in vocab_index and not is_junk_token(tok):
|
| 840 |
+
i = vocab_index[tok]
|
| 841 |
+
frac = cnt_docs / n_docs_c
|
| 842 |
+
subj_cov[i] = frac
|
| 843 |
+
subj_cov_frac[i] = frac
|
| 844 |
|
| 845 |
+
# base + subject alpha
|
| 846 |
row_boosted = row + subject_alpha * subj_cov
|
| 847 |
+
# final score gets a preference bump if subj coverage ≥ threshold (but not a hard filter)
|
| 848 |
+
pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
|
| 849 |
+
final = row_boosted + pref_bump
|
| 850 |
+
|
| 851 |
+
order = final.argsort()[::-1]
|
| 852 |
unis = []
|
| 853 |
for i in order:
|
| 854 |
tok = vocab[i]
|
|
|
|
| 956 |
return seeds_red
|
| 957 |
return None
|
| 958 |
|
| 959 |
+
# =================== NEW: cluster stabilizer (merge near-dupes + reassign tiny → big; else NOISE=-3) ===================
|
| 960 |
+
def _centroids_from_labels(X, labels):
|
| 961 |
+
labs = np.asarray(labels, dtype=int)
|
| 962 |
+
uniq = np.unique(labs)
|
| 963 |
+
cents = {}
|
| 964 |
+
if isinstance(X, np.ndarray):
|
| 965 |
+
for c in uniq:
|
| 966 |
+
idx = (labs == c)
|
| 967 |
+
if not np.any(idx): continue
|
| 968 |
+
v = X[idx].mean(axis=0)
|
| 969 |
+
n = np.linalg.norm(v)
|
| 970 |
+
if n > 0: v = v / n
|
| 971 |
+
cents[int(c)] = v.astype(np.float32)
|
| 972 |
+
return cents
|
| 973 |
+
# CSR sparse
|
| 974 |
+
X = X.tocsr()
|
| 975 |
+
for c in uniq:
|
| 976 |
+
rows = np.where(labs == c)[0]
|
| 977 |
+
if rows.size == 0: continue
|
| 978 |
+
sub = X[rows]
|
| 979 |
+
v = np.asarray(sub.mean(axis=0)).ravel()
|
| 980 |
+
n = np.linalg.norm(v)
|
| 981 |
+
if n > 0: v = v / n
|
| 982 |
+
cents[int(c)] = v.astype(np.float32)
|
| 983 |
+
return cents
|
| 984 |
+
|
| 985 |
+
def _cosine_sim_to_centroids(vecs, centroids):
|
| 986 |
+
if not centroids:
|
| 987 |
+
return None, None
|
| 988 |
+
keys = list(centroids.keys())
|
| 989 |
+
C = np.stack([centroids[k] for k in keys], axis=0) # (k,d)
|
| 990 |
+
if isinstance(vecs, np.ndarray):
|
| 991 |
+
sims = vecs @ C.T
|
| 992 |
+
else:
|
| 993 |
+
sims = vecs.dot(C.T)
|
| 994 |
+
if hasattr(sims, "toarray"): sims = sims.toarray()
|
| 995 |
+
best_idx = np.argmax(sims, axis=1)
|
| 996 |
+
best_lab = np.array([keys[i] for i in best_idx], dtype=int)
|
| 997 |
+
best_sim = sims[np.arange(sims.shape[0]), best_idx]
|
| 998 |
+
return best_lab, best_sim
|
| 999 |
+
|
| 1000 |
+
def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
|
| 1001 |
+
labs = np.asarray(labels, dtype=int)
|
| 1002 |
+
|
| 1003 |
+
# 1) merge very close centroids
|
| 1004 |
+
cents = _centroids_from_labels(X_space, labs)
|
| 1005 |
+
keys = sorted([k for k in cents.keys() if k >= 0])
|
| 1006 |
+
if len(keys) >= 2:
|
| 1007 |
+
C = np.stack([cents[k] for k in keys], axis=0)
|
| 1008 |
+
sims = C @ C.T
|
| 1009 |
+
parent = {k:k for k in keys}
|
| 1010 |
+
def find(a):
|
| 1011 |
+
while parent[a]!=a:
|
| 1012 |
+
a = parent[a]
|
| 1013 |
+
return a
|
| 1014 |
+
for i in range(len(keys)):
|
| 1015 |
+
for j in range(i+1, len(keys)):
|
| 1016 |
+
if sims[i,j] >= float(merge_thresh):
|
| 1017 |
+
ri, rj = find(keys[i]), find(keys[j])
|
| 1018 |
+
if ri != rj:
|
| 1019 |
+
parent[rj] = ri
|
| 1020 |
+
root = {k: find(k) for k in keys}
|
| 1021 |
+
merge_map = {k: root[k] for k in keys}
|
| 1022 |
+
labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
|
| 1023 |
+
cents = _centroids_from_labels(X_space, labs)
|
| 1024 |
+
|
| 1025 |
+
# 2) reassign tiny clusters to nearest big centroid (else noise -3)
|
| 1026 |
+
vc = pd.Series(labs).value_counts()
|
| 1027 |
+
big_labs = set(vc[vc >= int(min_size)].index.tolist())
|
| 1028 |
+
small_labs = set(vc[vc < int(min_size)].index.tolist())
|
| 1029 |
+
big_cents = {c: cents[c] for c in big_labs if c in cents and c >= 0}
|
| 1030 |
+
|
| 1031 |
+
NOISE_ID = -3
|
| 1032 |
+
if small_labs and big_cents:
|
| 1033 |
+
idx_small = np.where(pd.Series(labs).isin(small_labs))[0]
|
| 1034 |
+
if idx_small.size > 0:
|
| 1035 |
+
sub = X_space[idx_small] if not isinstance(X_space, np.ndarray) else X_space[idx_small]
|
| 1036 |
+
best_lab, best_sim = _cosine_sim_to_centroids(sub, big_cents)
|
| 1037 |
+
reassigned = np.where(best_sim >= float(reassign_thresh), best_lab, NOISE_ID)
|
| 1038 |
+
labs[idx_small] = reassigned
|
| 1039 |
+
|
| 1040 |
+
return labs
|
| 1041 |
+
|
| 1042 |
# =================== Scoring & Flags ===================
|
| 1043 |
def _hour_of(dt_iso: str) -> Optional[int]:
|
| 1044 |
try:
|
| 1045 |
if not dt_iso: return None
|
| 1046 |
dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
|
| 1047 |
if pd.isna(dt): return None
|
|
|
|
| 1048 |
return int(dt.hour)
|
| 1049 |
except Exception:
|
| 1050 |
return None
|
|
|
|
| 1074 |
body_len = len(row.get("body_text",""))
|
| 1075 |
if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
|
| 1076 |
score += 0.5
|
|
|
|
| 1077 |
fd = (row.get("from_domain") or "").lower()
|
| 1078 |
if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
|
| 1079 |
score += 0.5
|
|
|
|
| 1080 |
h = _hour_of(row.get("date") or "")
|
| 1081 |
if h is not None and (h < 6 or h > 22):
|
| 1082 |
score += 0.3
|
|
|
|
| 1156 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 1157 |
sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
|
| 1158 |
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 1159 |
+
# NEW: hide noise toggle
|
| 1160 |
+
hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)
|
| 1161 |
|
| 1162 |
with gr.Row():
|
| 1163 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
| 1235 |
tag_value: str,
|
| 1236 |
start: str,
|
| 1237 |
end: str,
|
| 1238 |
+
hide_noise_flag: bool = False, # NEW
|
| 1239 |
) -> pd.DataFrame:
|
| 1240 |
out = df
|
| 1241 |
if cluster and cluster != "(any)":
|
|
|
|
| 1266 |
out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
|
| 1267 |
except Exception:
|
| 1268 |
pass
|
| 1269 |
+
if hide_noise_flag:
|
| 1270 |
+
out = out[out["cluster_id"] != -3]
|
| 1271 |
return out
|
| 1272 |
|
| 1273 |
# -------- Simple social network stats --------
|
| 1274 |
def social_stats(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 1275 |
deg = {}
|
| 1276 |
def add_edge(a,b):
|
| 1277 |
if not a or not b or a==b: return
|
|
|
|
| 1302 |
|
| 1303 |
# === Vectorization & Clustering (UPGRADED) ===
|
| 1304 |
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
|
|
|
| 1305 |
texts = list(df_in.apply(enrich_text, axis=1))
|
| 1306 |
subjects_only = list(df_in["subject"].fillna(""))
|
| 1307 |
return texts, subjects_only
|
|
|
|
| 1343 |
count_vec = None; bm25 = None
|
| 1344 |
return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
|
| 1345 |
|
|
|
|
| 1346 |
count_vec = CountVectorizer(
|
| 1347 |
analyzer="word", ngram_range=ngram_range,
|
| 1348 |
max_features=int(max_features) if max_features else None,
|
|
|
|
| 1372 |
|
| 1373 |
n_docs = X_full.shape[0]
|
| 1374 |
n_feats = X_full.shape[1]
|
|
|
|
| 1375 |
max_components = max(1, min(n_docs, n_feats) - 1)
|
| 1376 |
n_comp = int(min(int(lsa_dim or 256), max_components))
|
| 1377 |
|
| 1378 |
if n_comp < 2:
|
|
|
|
| 1379 |
return None, None, None
|
| 1380 |
|
| 1381 |
svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
|
|
|
|
| 1386 |
return X_reduced, svd_obj, norm_obj
|
| 1387 |
|
| 1388 |
def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
|
|
|
|
|
|
|
|
|
|
| 1389 |
if kv is None or emb_dim <= 0 or weight <= 0.0:
|
| 1390 |
return X_reduced_or_full, emb_dim
|
| 1391 |
doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
|
|
|
|
| 1412 |
norm_obj,
|
| 1413 |
d_word, d_char
|
| 1414 |
):
|
|
|
|
|
|
|
|
|
|
| 1415 |
n = X_space.shape[0]
|
| 1416 |
|
|
|
|
| 1417 |
if n <= 1:
|
| 1418 |
labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
|
| 1419 |
centers = None
|
| 1420 |
chosen_k = int(n) if n > 0 else 0
|
| 1421 |
return labels, centers, chosen_k
|
| 1422 |
if n < 10:
|
|
|
|
| 1423 |
k_small = min(max(2, n // 2), n)
|
| 1424 |
kmeans = MiniBatchKMeans(
|
| 1425 |
n_clusters=int(k_small),
|
|
|
|
| 1445 |
chosen_k = int(len(set([l for l in labels if l >= 0])))
|
| 1446 |
return labels, centers, chosen_k
|
| 1447 |
|
|
|
|
| 1448 |
if bool(auto_k):
|
| 1449 |
if use_lsa and isinstance(X_space, np.ndarray):
|
| 1450 |
k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
|
|
|
|
| 1476 |
chosen_k = int(len(set(labels)))
|
| 1477 |
return labels, centers, chosen_k
|
| 1478 |
|
|
|
|
| 1479 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 1480 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1481 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
|
|
|
| 1526 |
if bool(use_embeddings):
|
| 1527 |
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1528 |
|
|
|
|
| 1529 |
parts = []
|
| 1530 |
if bool(per_language) and "lang" in df_main.columns:
|
| 1531 |
for lang_code, grp in df_main.groupby("lang", dropna=False):
|
|
|
|
| 1589 |
d_word=d_word,
|
| 1590 |
d_char=d_char,
|
| 1591 |
)
|
| 1592 |
+
# NEW: stabilize per partition
|
| 1593 |
+
labels = stabilize_labels(
|
| 1594 |
+
X_space, labels,
|
| 1595 |
+
min_size=40,
|
| 1596 |
+
merge_thresh=0.96,
|
| 1597 |
+
reassign_thresh=0.35,
|
| 1598 |
+
)
|
| 1599 |
+
|
| 1600 |
+
k_agg += len(set(labels))
|
| 1601 |
|
| 1602 |
term_names = cluster_labels_pmi_bigram(
|
| 1603 |
+
texts=texts, labels=labels, subjects=subjects_only,
|
| 1604 |
+
topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
|
| 1605 |
)
|
| 1606 |
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1607 |
|
|
|
|
| 1608 |
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1609 |
cluster_name_list.append(
|
| 1610 |
pd.Series(
|
| 1611 |
+
[term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels],
|
| 1612 |
index=df_part.index,
|
| 1613 |
)
|
| 1614 |
)
|
|
|
|
| 1626 |
df_main["cluster_name"] = "unclustered"
|
| 1627 |
df_main["anomaly_score"] = np.nan
|
| 1628 |
|
|
|
|
| 1629 |
if len(df_news):
|
| 1630 |
df_news.loc[:, "cluster_id"] = -1
|
| 1631 |
df_news.loc[:, "cluster_name"] = "newsletter/news"
|
|
|
|
| 1654 |
except Exception:
|
| 1655 |
pass
|
| 1656 |
|
|
|
|
| 1657 |
cluster_counts = (
|
| 1658 |
df.groupby(["cluster_id", "cluster_name"])
|
| 1659 |
.size()
|
|
|
|
| 1692 |
"max_df": float(max_df),
|
| 1693 |
"use_bigrams": bool(use_bigrams),
|
| 1694 |
}
|
| 1695 |
+
status_md = f"**Processed {len(df):,} emails** | clusters ~ {len(cluster_counts):,} (showing top 500)"
|
| 1696 |
|
| 1697 |
svd_obj_out = svd_obj_local if single_partition else None
|
| 1698 |
norm_obj_out = norm_obj_local if single_partition else None
|
|
|
|
| 1817 |
|
| 1818 |
return tmp[[c for c in cols_out if c in tmp.columns]].head(500)
|
| 1819 |
|
| 1820 |
+
def refresh_results(df, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
|
| 1821 |
if df is None or len(df) == 0:
|
| 1822 |
return pd.DataFrame()
|
| 1823 |
+
filt = _apply_filters(
|
| 1824 |
+
df, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
|
| 1825 |
+
)
|
| 1826 |
return _sort_results(filt, sort_by, sort_dir)
|
| 1827 |
|
| 1828 |
+
# Re-run when any filter control changes (including hide_noise)
|
| 1829 |
+
for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1830 |
+
date_start, date_end, sort_by, sort_dir, hide_noise]:
|
| 1831 |
ctrl.change(
|
| 1832 |
refresh_results,
|
| 1833 |
inputs=[
|
|
|
|
| 1842 |
date_end,
|
| 1843 |
sort_by,
|
| 1844 |
sort_dir,
|
| 1845 |
+
hide_noise,
|
| 1846 |
],
|
| 1847 |
outputs=[results_df],
|
| 1848 |
)
|
| 1849 |
|
| 1850 |
+
# Reset filters (sets selects to (any), dates blank, sort default, and hide_noise= True)
|
| 1851 |
reset_btn.click(
|
| 1852 |
+
lambda: ["(any)"] * 6 + [""] * 2 + ["corruption_score", "desc"] + [True],
|
| 1853 |
[],
|
| 1854 |
+
[cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
|
| 1855 |
+
date_start, date_end, sort_by, sort_dir, hide_noise],
|
| 1856 |
).then(
|
| 1857 |
refresh_results,
|
| 1858 |
inputs=[
|
|
|
|
| 1867 |
date_end,
|
| 1868 |
sort_by,
|
| 1869 |
sort_dir,
|
| 1870 |
+
hide_noise,
|
| 1871 |
],
|
| 1872 |
outputs=[results_df],
|
| 1873 |
)
|
| 1874 |
|
| 1875 |
+
# -------- Search helpers --------
|
| 1876 |
def _tokenize_query(q: str) -> List[str]:
|
| 1877 |
+
return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]
|
| 1878 |
|
| 1879 |
def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
|
| 1880 |
try:
|
| 1881 |
return norm.transform(svd.transform(q_vec)).astype(np.float32)
|
| 1882 |
+
except Exception:
|
| 1883 |
return None
|
| 1884 |
|
| 1885 |
def _vectorize_query(q, vec_state, corpus_texts):
|
| 1886 |
+
# Build the same features for the query that we used for docs
|
| 1887 |
char_min_df = 1 if len(corpus_texts) <= 1 else 2
|
| 1888 |
+
|
| 1889 |
if vec_state.get("use_hashing"):
|
| 1890 |
hv = HashingVectorizer(
|
| 1891 |
analyzer="word",
|
|
|
|
| 1893 |
n_features=2 ** vec_state.get("hash_bits", 18),
|
| 1894 |
token_pattern=TOKEN_PATTERN,
|
| 1895 |
lowercase=True,
|
| 1896 |
+
norm=None,
|
| 1897 |
+
alternate_sign=False,
|
| 1898 |
)
|
| 1899 |
+
# Fit TF-IDF weights from corpus
|
| 1900 |
counts = hv.transform(corpus_texts)
|
| 1901 |
tfidf_tr = TfidfTransformer().fit(counts)
|
| 1902 |
q_word = tfidf_tr.transform(hv.transform([q]))
|
|
|
|
| 1910 |
token_pattern=TOKEN_PATTERN,
|
| 1911 |
lowercase=True,
|
| 1912 |
stop_words=STOPWORD_FOR_VEC,
|
| 1913 |
+
dtype=np.float32,
|
| 1914 |
)
|
| 1915 |
tf = cv.fit_transform(corpus_texts)
|
| 1916 |
bm25 = BM25Transformer().fit(tf)
|
| 1917 |
q_word = bm25.transform(cv.transform([q]))
|
| 1918 |
|
| 1919 |
char_vec = CharTfidf(
|
| 1920 |
+
analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
|
| 1921 |
).fit(corpus_texts)
|
| 1922 |
q_char = char_vec.transform([q])
|
| 1923 |
+
|
| 1924 |
return hstack([q_word, q_char * 0.20], format="csr")
|
| 1925 |
|
| 1926 |
def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
|
| 1927 |
if not q or df is None or vec is None or index is None:
|
| 1928 |
return pd.DataFrame(), []
|
| 1929 |
|
| 1930 |
+
# Search ignores newsletters/alerts/noise by default
|
| 1931 |
mask = ~df["cluster_id"].isin([-1, -2, -3])
|
| 1932 |
df_main = df[mask].reset_index(drop=True)
|
| 1933 |
if df_main.empty:
|
|
|
|
| 1936 |
q_terms = _tokenize_query(q)
|
| 1937 |
q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))
|
| 1938 |
|
| 1939 |
+
q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd is not None and norm is not None else q_vec
|
| 1940 |
if q_emb is None:
|
| 1941 |
return pd.DataFrame(), q_terms
|
| 1942 |
|
|
|
|
| 1945 |
return pd.DataFrame(), q_terms
|
| 1946 |
|
| 1947 |
if isinstance(index, NearestNeighbors):
|
| 1948 |
+
# brute-force cosine on reduced space
|
| 1949 |
if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
|
| 1950 |
return pd.DataFrame(), q_terms
|
| 1951 |
dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
|
| 1952 |
sims = 1.0 - dists[0]
|
| 1953 |
results = df_main.iloc[inds[0]].copy()
|
| 1954 |
results["search_score"] = sims
|
| 1955 |
+
elif use_faiss and FAISS_OK and hasattr(index, "search"):
|
| 1956 |
D, I = index.search(q_emb.astype(np.float32), k=n_req)
|
| 1957 |
results = df_main.iloc[I[0]].copy()
|
| 1958 |
results["search_score"] = D[0]
|
|
|
|
| 1979 |
outputs=[results_df, state_query_terms],
|
| 1980 |
)
|
| 1981 |
|
| 1982 |
+
# -------- Reader selection (build highlighted HTML) --------
|
| 1983 |
def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
|
| 1984 |
if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
|
| 1985 |
return ""
|
| 1986 |
row_idx = evt.index[0]
|
| 1987 |
sel = table.iloc[row_idx]
|
| 1988 |
|
| 1989 |
+
# Try to match the original row
|
| 1990 |
cand = df[
|
| 1991 |
(df["subject"] == sel.get("subject"))
|
| 1992 |
& (df["from_email"] == sel.get("from_email"))
|
|
|
|
| 2001 |
cid = int(row.get("cluster_id", -99))
|
| 2002 |
clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
|
| 2003 |
return build_highlighted_html(
|
| 2004 |
+
row,
|
| 2005 |
+
query_terms=q_terms,
|
| 2006 |
+
cluster_label=clabel,
|
| 2007 |
+
do_highlight=do_highlight,
|
| 2008 |
+
extra_terms=extra_terms,
|
| 2009 |
)
|
| 2010 |
|
| 2011 |
results_df.select(
|
|
|
|
| 2014 |
outputs=[email_view],
|
| 2015 |
)
|
| 2016 |
|
| 2017 |
+
# Click-to-filter conveniences for summary tables
|
| 2018 |
def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
|
| 2019 |
if evt.index is None or df_sum is None or df_sum.empty:
|
| 2020 |
return gr.update()
|
| 2021 |
val = df_sum.iloc[evt.index[0]][col_name]
|
| 2022 |
return gr.update(value=val)
|
| 2023 |
|
| 2024 |
+
# Cluster summary → set cluster filter
|
| 2025 |
cluster_counts_df.select(
|
| 2026 |
lambda evt, df: on_click_filter(evt, df, "label", cluster_drop), [cluster_counts_df], [cluster_drop]
|
| 2027 |
).then(
|
|
|
|
| 2038 |
date_end,
|
| 2039 |
sort_by,
|
| 2040 |
sort_dir,
|
| 2041 |
+
hide_noise,
|
| 2042 |
],
|
| 2043 |
outputs=[results_df],
|
| 2044 |
)
|
| 2045 |
+
|
| 2046 |
+
# Domain summary → set domain filter
|
| 2047 |
domain_counts_df.select(
|
| 2048 |
lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
|
| 2049 |
).then(
|
|
|
|
| 2060 |
date_end,
|
| 2061 |
sort_by,
|
| 2062 |
sort_dir,
|
| 2063 |
+
hide_noise,
|
| 2064 |
],
|
| 2065 |
outputs=[results_df],
|
| 2066 |
)
|
| 2067 |
+
|
| 2068 |
+
# Sender summary → set sender filter
|
| 2069 |
sender_counts_df.select(
|
| 2070 |
lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
|
| 2071 |
).then(
|
|
|
|
| 2082 |
date_end,
|
| 2083 |
sort_by,
|
| 2084 |
sort_dir,
|
| 2085 |
+
hide_noise,
|
| 2086 |
],
|
| 2087 |
outputs=[results_df],
|
| 2088 |
)
|
| 2089 |
|
|
|
|
| 2090 |
if __name__ == "__main__":
|
| 2091 |
# Disable SSR to avoid handler arity warnings under server-side rendering
|
| 2092 |
+
demo.launch(ssr_mode=False)
|