Update app.py
Browse files
app.py
CHANGED
|
@@ -103,6 +103,17 @@ OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
|
| 103 |
# Common personal mail domains (used with user-specified trusted org domains)
|
| 104 |
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 107 |
CORR_LEX = {
|
| 108 |
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
|
@@ -111,7 +122,7 @@ CORR_LEX = {
|
|
| 111 |
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 112 |
}
|
| 113 |
|
| 114 |
-
# =================== Label cleanup helpers
|
| 115 |
EN_STOP = {
|
| 116 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
| 117 |
"that","this","it","be","are","was","were","an","a","you","your","we","our","us",
|
|
@@ -522,37 +533,79 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 522 |
if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
|
| 523 |
if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
|
| 524 |
if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
|
|
|
|
|
|
|
| 525 |
return (t + " " + " ".join(tokens)).strip()
|
| 526 |
|
| 527 |
-
# =================== Cluster labeling: PMI
|
| 528 |
def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
| 529 |
-
def bigrams(t):
|
| 530 |
-
toks = re.findall(TOKEN_PATTERN, t.lower())
|
| 531 |
-
return [" ".join(p) for p in zip(toks, toks[1:])]
|
| 532 |
-
N = len(texts)
|
| 533 |
-
from collections import Counter
|
| 534 |
import math as _math
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
glob_bg = Counter()
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
| 539 |
glob_bg.update(bgs)
|
| 540 |
-
|
|
|
|
|
|
|
| 541 |
labels_out = {}
|
| 542 |
-
total_bg = sum(glob_bg.values()) + 1e-
|
| 543 |
-
|
| 544 |
-
|
|
|
|
| 545 |
scores = []
|
| 546 |
-
total_c = sum(
|
| 547 |
-
for bg, cnt in
|
| 548 |
p_bg_c = cnt / total_c
|
| 549 |
p_bg = (glob_bg[bg] / total_bg)
|
| 550 |
if p_bg > 0 and p_bg_c > 0:
|
| 551 |
score = _math.log(p_bg_c) - _math.log(p_bg)
|
| 552 |
scores.append((score, bg))
|
| 553 |
scores.sort(reverse=True)
|
| 554 |
-
|
| 555 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
return labels_out
|
| 557 |
|
| 558 |
# =================== Auto-k & merge ===================
|
|
@@ -707,13 +760,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 707 |
with gr.Accordion("Vectorization & Clustering", open=True):
|
| 708 |
with gr.Row():
|
| 709 |
max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
|
| 710 |
-
min_df = gr.Number(label="min_df (doc freq ≥)", value=
|
| 711 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 712 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 713 |
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
| 714 |
with gr.Row():
|
| 715 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 716 |
-
lsa_dim = gr.Number(label="LSA components", value=
|
| 717 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 718 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 719 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
|
@@ -878,7 +931,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 878 |
# trusted org domains
|
| 879 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 880 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 881 |
-
# extend
|
| 882 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 883 |
|
| 884 |
recs = _load_json_records(inbox_file.name)
|
|
@@ -921,8 +974,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 921 |
flags.append(f)
|
| 922 |
df["flags"] = flags
|
| 923 |
|
| 924 |
-
#
|
| 925 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 926 |
|
| 927 |
# === Vectorization ===
|
| 928 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
|
@@ -946,7 +1004,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 946 |
)
|
| 947 |
X_char = char_vec.fit_transform(texts)
|
| 948 |
|
| 949 |
-
|
|
|
|
| 950 |
d_word = X_word.shape[1]
|
| 951 |
d_char = X_char.shape[1]
|
| 952 |
d_full = X_full.shape[1]
|
|
@@ -965,16 +1024,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 965 |
gc.collect()
|
| 966 |
|
| 967 |
# Optional anomaly detection (on LSA space)
|
| 968 |
-
anomaly_scores = np.full((len(
|
| 969 |
if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
|
| 970 |
try:
|
| 971 |
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 972 |
iso.fit(X_reduced)
|
| 973 |
-
# higher
|
| 974 |
-
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
|
| 975 |
except Exception:
|
| 976 |
pass
|
| 977 |
-
df["anomaly_score"] = anomaly_scores
|
| 978 |
|
| 979 |
# K selection
|
| 980 |
if bool(auto_k):
|
|
@@ -1006,20 +1063,29 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1006 |
)
|
| 1007 |
labels = kmeans.fit_predict(X_space)
|
| 1008 |
|
| 1009 |
-
# Merge very-similar clusters (LSA only)
|
| 1010 |
if use_lsa:
|
| 1011 |
-
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.
|
| 1012 |
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
# Cluster names
|
| 1016 |
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 1017 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1018 |
|
| 1019 |
-
#
|
|
|
|
|
|
|
|
|
|
| 1020 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1021 |
|
| 1022 |
-
# Build search index
|
| 1023 |
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
| 1024 |
index_obj = None
|
| 1025 |
if use_faiss:
|
|
@@ -1033,7 +1099,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1033 |
|
| 1034 |
# Summaries
|
| 1035 |
cluster_counts = (
|
| 1036 |
-
df
|
|
|
|
| 1037 |
.reset_index(name="count")
|
| 1038 |
.sort_values("count", ascending=False)
|
| 1039 |
.head(500)
|
|
@@ -1041,6 +1108,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1041 |
cluster_counts["label"] = cluster_counts.apply(
|
| 1042 |
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 1043 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1044 |
cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
|
| 1045 |
|
| 1046 |
domain_counts = (
|
|
@@ -1084,7 +1160,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1084 |
|
| 1085 |
status_md = (
|
| 1086 |
f"**Processed {len(df):,} emails** \n"
|
| 1087 |
-
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
|
| 1088 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 1089 |
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1090 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
|
@@ -1143,7 +1219,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1143 |
elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
|
| 1144 |
tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
|
| 1145 |
else:
|
| 1146 |
-
# corruption_score or search_score (if present)
|
| 1147 |
col = by if by in tmp.columns else "corruption_score"
|
| 1148 |
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1149 |
tmp = tmp.drop(columns=["_dt"])
|
|
@@ -1216,16 +1291,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1216 |
q_emb = q_vec_full
|
| 1217 |
|
| 1218 |
if isinstance(index_obj, NearestNeighbors):
|
| 1219 |
-
distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df)))
|
| 1220 |
inds = indices[0]
|
| 1221 |
sims = 1.0 - distances[0]
|
| 1222 |
-
results = df.iloc[inds].copy()
|
| 1223 |
results["search_score"] = sims
|
| 1224 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1225 |
-
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 1226 |
inds = I[0]
|
| 1227 |
sims = D[0]
|
| 1228 |
-
results = df.iloc[inds].copy()
|
| 1229 |
results["search_score"] = sims
|
| 1230 |
else:
|
| 1231 |
return pd.DataFrame(), q_terms
|
|
@@ -1238,7 +1313,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1238 |
if sort_by == "search_score":
|
| 1239 |
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
| 1240 |
else:
|
| 1241 |
-
# use blended but keep sort_by if chosen
|
| 1242 |
if sort_by in results.columns:
|
| 1243 |
results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
|
| 1244 |
else:
|
|
|
|
| 103 |
# Common personal mail domains (used with user-specified trusted org domains)
|
| 104 |
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 105 |
|
| 106 |
+
# Newsletter/newswire heuristics
|
| 107 |
+
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
|
| 108 |
+
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
| 109 |
+
s = (subject or "").lower()
|
| 110 |
+
b = (body or "").lower()
|
| 111 |
+
fd = (from_domain or "").lower()
|
| 112 |
+
if "unsubscribe" in b or "manage preferences" in b: return True
|
| 113 |
+
if any(k in s for k in ["daily briefing","morning update","newsletter","top stories"]): return True
|
| 114 |
+
if any(d in fd for d in NEWS_DOMAINS): return True
|
| 115 |
+
return False
|
| 116 |
+
|
| 117 |
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 118 |
CORR_LEX = {
|
| 119 |
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
|
|
|
| 122 |
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 123 |
}
|
| 124 |
|
| 125 |
+
# =================== Label cleanup helpers ===================
|
| 126 |
EN_STOP = {
|
| 127 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
| 128 |
"that","this","it","be","are","was","were","an","a","you","your","we","our","us",
|
|
|
|
| 533 |
if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
|
| 534 |
if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
|
| 535 |
if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
|
| 536 |
+
lang_tok = f'__LANG_{(row.get("lang") or "unk").lower()}__'
|
| 537 |
+
tokens.append(lang_tok)
|
| 538 |
return (t + " " + " ".join(tokens)).strip()
|
| 539 |
|
| 540 |
+
# =================== Cluster labeling: improved PMI + class-TFIDF ===================
|
| 541 |
def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
import math as _math
|
| 543 |
+
from collections import Counter, defaultdict
|
| 544 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 545 |
+
|
| 546 |
+
HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded","回复","主题","收件人","发件人"}
|
| 547 |
+
|
| 548 |
+
def is_junk_token(tok: str) -> bool:
|
| 549 |
+
if _is_junk_term(tok): return True
|
| 550 |
+
tl = tok.lower()
|
| 551 |
+
if "@" in tl: return True
|
| 552 |
+
if tl in HEADER_STOP: return True
|
| 553 |
+
if re.search(r"[^\w\-']", tl): # punctuation blobs
|
| 554 |
+
if "’" not in tl and "'" not in tl:
|
| 555 |
+
return True
|
| 556 |
+
return False
|
| 557 |
+
|
| 558 |
+
def tokenize_clean(t):
|
| 559 |
+
toks = re.findall(TOKEN_PATTERN, t.lower())
|
| 560 |
+
return [w for w in toks if not is_junk_token(w)]
|
| 561 |
+
|
| 562 |
+
def bigrams(toks):
|
| 563 |
+
return [" ".join(p) for p in zip(toks, toks[1:]) if all(not is_junk_token(x) for x in p)]
|
| 564 |
+
|
| 565 |
glob_bg = Counter()
|
| 566 |
+
per_c_bg = defaultdict(Counter)
|
| 567 |
+
per_c_texts = defaultdict(list)
|
| 568 |
+
|
| 569 |
+
for txt, c in zip(texts, labels):
|
| 570 |
+
toks = tokenize_clean(txt)
|
| 571 |
+
bgs = set(bigrams(toks))
|
| 572 |
glob_bg.update(bgs)
|
| 573 |
+
per_c_bg[int(c)].update(bgs)
|
| 574 |
+
per_c_texts[int(c)].append(" ".join(toks))
|
| 575 |
+
|
| 576 |
labels_out = {}
|
| 577 |
+
total_bg = sum(glob_bg.values()) + 1e-12
|
| 578 |
+
|
| 579 |
+
for c in sorted(set(int(x) for x in labels)):
|
| 580 |
+
# PMI bigrams
|
| 581 |
scores = []
|
| 582 |
+
total_c = sum(per_c_bg[c].values()) + 1e-12
|
| 583 |
+
for bg, cnt in per_c_bg[c].most_common(2000):
|
| 584 |
p_bg_c = cnt / total_c
|
| 585 |
p_bg = (glob_bg[bg] / total_bg)
|
| 586 |
if p_bg > 0 and p_bg_c > 0:
|
| 587 |
score = _math.log(p_bg_c) - _math.log(p_bg)
|
| 588 |
scores.append((score, bg))
|
| 589 |
scores.sort(reverse=True)
|
| 590 |
+
top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
|
| 591 |
+
|
| 592 |
+
# class-TFIDF unigrams (cluster doc vs. background doc)
|
| 593 |
+
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 594 |
+
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
|
| 595 |
+
corpus = [docs_c[0], docs_bg[0]]
|
| 596 |
+
vec = TfidfVectorizer(
|
| 597 |
+
analyzer="word", ngram_range=(1,1),
|
| 598 |
+
max_features=3000, token_pattern=TOKEN_PATTERN, lowercase=True
|
| 599 |
+
)
|
| 600 |
+
X = vec.fit_transform(corpus)
|
| 601 |
+
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 602 |
+
row = X[0].toarray().ravel()
|
| 603 |
+
top_idx = row.argsort()[::-1][: max(0, topn - len(top_bi)) ]
|
| 604 |
+
top_uni = [t for t in vocab[top_idx] if not is_junk_token(t)][: max(0, topn - len(top_bi)) ]
|
| 605 |
+
|
| 606 |
+
parts = top_bi + top_uni
|
| 607 |
+
labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
|
| 608 |
+
|
| 609 |
return labels_out
|
| 610 |
|
| 611 |
# =================== Auto-k & merge ===================
|
|
|
|
| 760 |
with gr.Accordion("Vectorization & Clustering", open=True):
|
| 761 |
with gr.Row():
|
| 762 |
max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
|
| 763 |
+
min_df = gr.Number(label="min_df (doc freq ≥)", value=3, precision=0) # tightened default
|
| 764 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 765 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 766 |
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
| 767 |
with gr.Row():
|
| 768 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 769 |
+
lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
|
| 770 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 771 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 772 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
|
|
|
| 931 |
# trusted org domains
|
| 932 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 933 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 934 |
+
# extend phrases at runtime
|
| 935 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 936 |
|
| 937 |
recs = _load_json_records(inbox_file.name)
|
|
|
|
| 974 |
flags.append(f)
|
| 975 |
df["flags"] = flags
|
| 976 |
|
| 977 |
+
# Identify news-like messages and separate them out before clustering
|
| 978 |
+
df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject",""), r.get("body_text",""), r.get("from_domain","")), axis=1)
|
| 979 |
+
df_main = df[~df["is_news"]].reset_index(drop=True)
|
| 980 |
+
df_news = df[df["is_news"]].reset_index(drop=True)
|
| 981 |
+
|
| 982 |
+
# Enriched texts (adds __HAS_*__ flags + __LANG__)
|
| 983 |
+
texts = list(df_main.apply(enrich_text, axis=1))
|
| 984 |
|
| 985 |
# === Vectorization ===
|
| 986 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
|
|
|
| 1004 |
)
|
| 1005 |
X_char = char_vec.fit_transform(texts)
|
| 1006 |
|
| 1007 |
+
# Down-weight char-grams so they don't dominate geometry
|
| 1008 |
+
X_full = hstack([X_word, X_char * 0.4], format="csr")
|
| 1009 |
d_word = X_word.shape[1]
|
| 1010 |
d_char = X_char.shape[1]
|
| 1011 |
d_full = X_full.shape[1]
|
|
|
|
| 1024 |
gc.collect()
|
| 1025 |
|
| 1026 |
# Optional anomaly detection (on LSA space)
|
| 1027 |
+
anomaly_scores = np.full((len(df_main),), np.nan, dtype=np.float32)
|
| 1028 |
if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
|
| 1029 |
try:
|
| 1030 |
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 1031 |
iso.fit(X_reduced)
|
| 1032 |
+
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32) # higher = more anomalous
|
|
|
|
| 1033 |
except Exception:
|
| 1034 |
pass
|
|
|
|
| 1035 |
|
| 1036 |
# K selection
|
| 1037 |
if bool(auto_k):
|
|
|
|
| 1063 |
)
|
| 1064 |
labels = kmeans.fit_predict(X_space)
|
| 1065 |
|
| 1066 |
+
# Merge very-similar clusters (LSA only) — slightly stricter
|
| 1067 |
if use_lsa:
|
| 1068 |
+
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.94)
|
| 1069 |
|
| 1070 |
+
# Attach clustering back to df_main
|
| 1071 |
+
df_main["cluster_id"] = labels
|
|
|
|
| 1072 |
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 1073 |
+
df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1074 |
+
df_main["anomaly_score"] = anomaly_scores
|
| 1075 |
+
|
| 1076 |
+
# Newsletter/newswire rows: assign a special cluster
|
| 1077 |
+
if len(df_news):
|
| 1078 |
+
df_news["cluster_id"] = -1
|
| 1079 |
+
df_news["cluster_name"] = "newsletter/news"
|
| 1080 |
+
df_news["anomaly_score"] = np.nan
|
| 1081 |
|
| 1082 |
+
# Combine back
|
| 1083 |
+
df = pd.concat([df_main, df_news], ignore_index=True)
|
| 1084 |
+
|
| 1085 |
+
# CorruptionScore (uses trusted domains)
|
| 1086 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1087 |
|
| 1088 |
+
# Build search index on clustered subset only
|
| 1089 |
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
| 1090 |
index_obj = None
|
| 1091 |
if use_faiss:
|
|
|
|
| 1099 |
|
| 1100 |
# Summaries
|
| 1101 |
cluster_counts = (
|
| 1102 |
+
df[df["cluster_id"] != -1]
|
| 1103 |
+
.groupby(["cluster_id", "cluster_name"]).size()
|
| 1104 |
.reset_index(name="count")
|
| 1105 |
.sort_values("count", ascending=False)
|
| 1106 |
.head(500)
|
|
|
|
| 1108 |
cluster_counts["label"] = cluster_counts.apply(
|
| 1109 |
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 1110 |
)
|
| 1111 |
+
# Optionally append newsletter bucket
|
| 1112 |
+
news_count = int((df["cluster_id"] == -1).sum())
|
| 1113 |
+
if news_count > 0:
|
| 1114 |
+
cluster_counts = pd.concat([
|
| 1115 |
+
cluster_counts,
|
| 1116 |
+
pd.DataFrame([{"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count,
|
| 1117 |
+
"label": f'-1 — newsletter/news ({news_count})'}])
|
| 1118 |
+
], ignore_index=True)
|
| 1119 |
+
|
| 1120 |
cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
|
| 1121 |
|
| 1122 |
domain_counts = (
|
|
|
|
| 1160 |
|
| 1161 |
status_md = (
|
| 1162 |
f"**Processed {len(df):,} emails** \n"
|
| 1163 |
+
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.4) | Total: {d_full:,} \n"
|
| 1164 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 1165 |
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1166 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
|
|
|
| 1219 |
elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
|
| 1220 |
tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
|
| 1221 |
else:
|
|
|
|
| 1222 |
col = by if by in tmp.columns else "corruption_score"
|
| 1223 |
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1224 |
tmp = tmp.drop(columns=["_dt"])
|
|
|
|
| 1291 |
q_emb = q_vec_full
|
| 1292 |
|
| 1293 |
if isinstance(index_obj, NearestNeighbors):
|
| 1294 |
+
distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df[df["cluster_id"]!=-1])))
|
| 1295 |
inds = indices[0]
|
| 1296 |
sims = 1.0 - distances[0]
|
| 1297 |
+
results = df[df["cluster_id"]!=-1].iloc[inds].copy()
|
| 1298 |
results["search_score"] = sims
|
| 1299 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1300 |
+
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df[df["cluster_id"]!=-1])))
|
| 1301 |
inds = I[0]
|
| 1302 |
sims = D[0]
|
| 1303 |
+
results = df[df["cluster_id"]!=-1].iloc[inds].copy()
|
| 1304 |
results["search_score"] = sims
|
| 1305 |
else:
|
| 1306 |
return pd.DataFrame(), q_terms
|
|
|
|
| 1313 |
if sort_by == "search_score":
|
| 1314 |
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
| 1315 |
else:
|
|
|
|
| 1316 |
if sort_by in results.columns:
|
| 1317 |
results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
|
| 1318 |
else:
|