Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,22 @@ from sklearn.preprocessing import Normalizer
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Optional light anomaly detection
|
| 29 |
try:
|
| 30 |
from sklearn.ensemble import IsolationForest
|
|
@@ -571,6 +587,62 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 571 |
)
|
| 572 |
return html
|
| 573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
# =================== Feature engineering (BM25 + char) ===================
|
| 575 |
class BM25Transformer:
|
| 576 |
def __init__(self, k1=1.2, b=0.75):
|
|
@@ -615,96 +687,117 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 615 |
return (t + " " + " ".join(tokens)).strip()
|
| 616 |
|
| 617 |
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
|
| 618 |
-
def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75):
|
| 619 |
"""
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
`subjects`: list of subject strings aligned with `texts`
|
| 625 |
-
`subject_alpha`: weight added per token = alpha * coverage_in_subjects (0..1)
|
| 626 |
"""
|
| 627 |
import math as _math
|
| 628 |
from collections import Counter, defaultdict
|
| 629 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 630 |
|
| 631 |
-
HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded",
|
|
|
|
| 632 |
|
| 633 |
def is_junk_token(tok: str) -> bool:
|
| 634 |
if _is_junk_term(tok): return True
|
| 635 |
tl = tok.lower()
|
| 636 |
-
if tl.startswith("__"): return True
|
| 637 |
-
if tl in STOP_TERMS: return True
|
| 638 |
if tl in HEADER_STOP: return True
|
| 639 |
if "@" in tl: return True
|
| 640 |
-
# drop short ASCII like "eb/ys/yl"
|
| 641 |
if tl.isascii() and len(tl) <= 2: return True
|
| 642 |
-
|
| 643 |
-
if re.search(r"[^\w\-']", tl):
|
| 644 |
-
if "’" not in tl and "'" not in tl:
|
| 645 |
-
return True
|
| 646 |
return False
|
| 647 |
|
| 648 |
def tokenize_clean(t):
|
| 649 |
-
toks = re.findall(TOKEN_PATTERN, t.lower())
|
| 650 |
return [w for w in toks if not is_junk_token(w)]
|
| 651 |
|
| 652 |
-
def
|
| 653 |
-
return [" ".join(p) for p in zip(toks
|
| 654 |
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
| 656 |
per_c_bg = defaultdict(Counter)
|
|
|
|
| 657 |
per_c_texts = defaultdict(list)
|
| 658 |
per_c_doc_count = defaultdict(int)
|
| 659 |
-
|
| 660 |
-
# SUBJECT presence (unique tokens/bigrams per subject per doc)
|
| 661 |
per_c_subj_uni_docs = defaultdict(Counter)
|
| 662 |
per_c_subj_bg_docs = defaultdict(Counter)
|
|
|
|
| 663 |
|
| 664 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 665 |
|
|
|
|
| 666 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 667 |
c = int(c)
|
| 668 |
toks = tokenize_clean(txt)
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
per_c_texts[c].append(" ".join(toks))
|
| 673 |
per_c_doc_count[c] += 1
|
| 674 |
-
|
| 675 |
if have_subjects:
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
|
|
|
|
|
|
|
|
|
| 682 |
labels_out = {}
|
| 683 |
-
|
|
|
|
|
|
|
|
|
|
| 684 |
|
| 685 |
for c in sorted(set(int(x) for x in labels)):
|
| 686 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 687 |
|
| 688 |
-
# PMI bigrams
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
if
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 707 |
-
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
|
| 708 |
corpus = [docs_c[0], docs_bg[0]]
|
| 709 |
vec = TfidfVectorizer(
|
| 710 |
analyzer="word", ngram_range=(1,1),
|
|
@@ -714,30 +807,27 @@ def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alph
|
|
| 714 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 715 |
row = X[0].toarray().ravel()
|
| 716 |
|
| 717 |
-
#
|
| 718 |
subj_cov = np.zeros_like(row)
|
| 719 |
if have_subjects:
|
| 720 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 721 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 722 |
-
if tok in vocab_index:
|
| 723 |
-
subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
row_boosted[i] = row[i] + subject_alpha * float(subj_cov[i])
|
| 730 |
-
|
| 731 |
-
top_idx = row_boosted.argsort()[::-1][: max(0, topn - len(top_bi)) ]
|
| 732 |
-
top_uni = []
|
| 733 |
-
for i in top_idx:
|
| 734 |
tok = vocab[i]
|
| 735 |
-
if
|
| 736 |
-
|
| 737 |
-
|
|
|
|
|
|
|
| 738 |
break
|
| 739 |
|
| 740 |
-
parts =
|
| 741 |
labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
|
| 742 |
|
| 743 |
return labels_out
|
|
@@ -898,12 +988,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 898 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 899 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 900 |
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
|
|
|
|
|
|
|
|
|
| 901 |
with gr.Row():
|
| 902 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 903 |
lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
|
| 904 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 905 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 906 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 907 |
with gr.Row():
|
| 908 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 909 |
use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
|
|
@@ -913,6 +1012,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 913 |
trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
|
| 914 |
extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
|
| 915 |
highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 916 |
with gr.Row():
|
| 917 |
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 918 |
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
|
@@ -934,6 +1039,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 934 |
with gr.Row():
|
| 935 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 936 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
|
|
|
|
|
|
|
|
|
| 937 |
|
| 938 |
with gr.Row():
|
| 939 |
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
|
@@ -1003,7 +1111,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1003 |
) -> pd.DataFrame:
|
| 1004 |
out = df
|
| 1005 |
if cluster and cluster != "(any)":
|
| 1006 |
-
m = re.match(r"^(\d+)\s+—", cluster)
|
| 1007 |
if m:
|
| 1008 |
cid = int(m.group(1))
|
| 1009 |
out = out[out["cluster_id"] == cid]
|
|
@@ -1054,13 +1162,182 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1054 |
# -------- Main pipeline --------
|
| 1055 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1056 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1057 |
-
trusted_domains_in, extra_keywords_in, highlight_toggle
|
|
|
|
|
|
|
|
|
|
| 1058 |
if inbox_file is None:
|
| 1059 |
return ("**Please upload a file.**",
|
| 1060 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1061 |
None, None, None, None)
|
| 1062 |
|
| 1063 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1064 |
|
| 1065 |
# trusted org domains
|
| 1066 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
|
@@ -1071,19 +1348,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1071 |
recs = _load_json_records(inbox_file.name)
|
| 1072 |
if not recs:
|
| 1073 |
return ("**No valid records found.**",
|
| 1074 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1075 |
None, None, None, None)
|
| 1076 |
|
| 1077 |
# Normalize
|
| 1078 |
normd = []
|
| 1079 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 1080 |
-
out = normalize_email_record(r, use_langdetect=
|
| 1081 |
if out and out.get("body_text") is not None:
|
| 1082 |
normd.append(out)
|
| 1083 |
df = pd.DataFrame(normd)
|
| 1084 |
if df.empty:
|
| 1085 |
return ("**No usable email records after normalization.**",
|
| 1086 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1087 |
None, None, None, None)
|
| 1088 |
|
| 1089 |
# Deduplicate conservatively
|
|
@@ -1116,135 +1393,150 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1116 |
df_news = df[df["is_news"]].reset_index(drop=True)
|
| 1117 |
df_alerts = df[df["is_notify"]].reset_index(drop=True)
|
| 1118 |
|
| 1119 |
-
#
|
| 1120 |
-
|
| 1121 |
-
subjects_only = list(df_main["subject"].fillna(""))
|
| 1122 |
-
|
| 1123 |
-
# === Vectorization ===
|
| 1124 |
-
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 1125 |
-
count_vec = CountVectorizer(
|
| 1126 |
-
analyzer="word",
|
| 1127 |
-
ngram_range=ngram_range,
|
| 1128 |
-
max_features=int(max_features) if max_features else None,
|
| 1129 |
-
min_df=int(min_df) if min_df else 2,
|
| 1130 |
-
max_df=float(max_df) if max_df else 0.7,
|
| 1131 |
-
token_pattern=TOKEN_PATTERN,
|
| 1132 |
-
lowercase=True,
|
| 1133 |
-
dtype=np.float32,
|
| 1134 |
-
stop_words=STOPWORD_FOR_VEC, # list (not set)
|
| 1135 |
-
)
|
| 1136 |
-
TF = count_vec.fit_transform(texts)
|
| 1137 |
-
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 1138 |
-
X_word = bm25.transform(TF) # sparse BM25 word matrix
|
| 1139 |
-
|
| 1140 |
-
char_vec = CharTfidf(
|
| 1141 |
-
analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
|
| 1142 |
-
lowercase=True, dtype=np.float32
|
| 1143 |
-
)
|
| 1144 |
-
X_char = char_vec.fit_transform(texts)
|
| 1145 |
-
|
| 1146 |
-
# Down-weight char-grams so they don't dominate geometry
|
| 1147 |
-
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1148 |
-
d_word = X_word.shape[1]
|
| 1149 |
-
d_char = X_char.shape[1]
|
| 1150 |
-
d_full = X_full.shape[1]
|
| 1151 |
-
|
| 1152 |
-
# LSA
|
| 1153 |
-
use_lsa = bool(use_lsa)
|
| 1154 |
-
X_reduced = None
|
| 1155 |
-
svd_obj = None
|
| 1156 |
-
norm_obj = None
|
| 1157 |
-
if use_lsa:
|
| 1158 |
-
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 1159 |
-
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
|
| 1160 |
-
norm_obj = Normalizer(copy=False)
|
| 1161 |
-
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 1162 |
-
del X_reduced_tmp
|
| 1163 |
-
gc.collect()
|
| 1164 |
-
|
| 1165 |
-
# Optional anomaly detection (on LSA space)
|
| 1166 |
-
anomaly_scores = np.full((len(df_main),), np.nan, dtype=np.float32)
|
| 1167 |
-
if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
|
| 1168 |
-
try:
|
| 1169 |
-
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 1170 |
-
iso.fit(X_reduced)
|
| 1171 |
-
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32) # higher = more anomalous
|
| 1172 |
-
except Exception:
|
| 1173 |
-
pass
|
| 1174 |
|
| 1175 |
-
#
|
| 1176 |
-
|
| 1177 |
-
|
| 1178 |
-
|
| 1179 |
-
|
| 1180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1181 |
else:
|
| 1182 |
-
|
| 1183 |
-
|
| 1184 |
-
#
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
)
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
-
|
| 1194 |
-
|
| 1195 |
-
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
| 1199 |
-
|
| 1200 |
-
|
| 1201 |
-
|
| 1202 |
-
|
| 1203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1204 |
|
| 1205 |
-
|
| 1206 |
-
|
| 1207 |
-
|
|
|
|
|
|
|
|
|
|
| 1208 |
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1213 |
-
df_main["anomaly_score"] = anomaly_scores
|
| 1214 |
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
-
df_news["cluster_name"] = "newsletter/news"
|
| 1219 |
-
df_news["anomaly_score"] = np.nan
|
| 1220 |
|
| 1221 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1222 |
if len(df_alerts):
|
| 1223 |
-
df_alerts["cluster_id"] = -2
|
| 1224 |
-
df_alerts["cluster_name"] = "system/alerts"
|
| 1225 |
-
df_alerts["anomaly_score"] = np.nan
|
| 1226 |
|
| 1227 |
# Combine back
|
| 1228 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 1229 |
|
| 1230 |
-
#
|
| 1231 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1232 |
|
| 1233 |
-
# Build search index on
|
| 1234 |
-
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (
|
| 1235 |
index_obj = None
|
| 1236 |
if use_faiss:
|
| 1237 |
-
d =
|
| 1238 |
-
index_obj = faiss.IndexFlatIP(d)
|
| 1239 |
-
index_obj.add(
|
| 1240 |
else:
|
| 1241 |
-
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1245 |
# Summaries
|
| 1246 |
cluster_counts = (
|
| 1247 |
-
df[
|
| 1248 |
.groupby(["cluster_id", "cluster_name"]).size()
|
| 1249 |
.reset_index(name="count")
|
| 1250 |
.sort_values("count", ascending=False)
|
|
@@ -1254,11 +1546,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1254 |
# Append buckets
|
| 1255 |
news_count = int((df["cluster_id"] == -1).sum())
|
| 1256 |
alerts_count = int((df["cluster_id"] == -2).sum())
|
|
|
|
| 1257 |
extra_rows = []
|
| 1258 |
if news_count > 0:
|
| 1259 |
extra_rows.append({"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count})
|
| 1260 |
if alerts_count > 0:
|
| 1261 |
extra_rows.append({"cluster_id": -2, "cluster_name": "system/alerts", "count": alerts_count})
|
|
|
|
|
|
|
| 1262 |
if extra_rows:
|
| 1263 |
cluster_counts = pd.concat([cluster_counts, pd.DataFrame(extra_rows)], ignore_index=True)
|
| 1264 |
|
|
@@ -1283,6 +1578,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1283 |
)
|
| 1284 |
sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
|
| 1285 |
|
|
|
|
| 1286 |
# Languages present
|
| 1287 |
langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
|
| 1288 |
lang_choices = ["(any)"] + langs
|
|
@@ -1304,13 +1600,21 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1304 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1305 |
out_table = show_df[cols_out].head(500)
|
| 1306 |
|
| 1307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1308 |
|
| 1309 |
status_md = (
|
| 1310 |
f"**Processed {len(df):,} emails** \n"
|
| 1311 |
-
f"Word feats
|
| 1312 |
-
f"{'LSA: ' + str(
|
| 1313 |
-
f"k = {
|
| 1314 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
| 1315 |
)
|
| 1316 |
|
|
@@ -1320,27 +1624,36 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1320 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1321 |
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1322 |
lang_update = gr.update(choices=lang_choices, value="(any)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1323 |
|
| 1324 |
return (
|
| 1325 |
status_md,
|
| 1326 |
-
cluster_counts, domain_counts,
|
| 1327 |
actors, offhours_table,
|
| 1328 |
out_table,
|
| 1329 |
-
df, vec_state, (
|
| 1330 |
-
use_lsa, bool(use_faiss),
|
| 1331 |
cluster_update, domain_update, sender_update, lang_update,
|
| 1332 |
-
|
| 1333 |
-
(
|
| 1334 |
extra_terms_lower, bool(highlight_toggle)
|
| 1335 |
)
|
| 1336 |
|
| 1337 |
(run_btn.click)(
|
| 1338 |
process_file,
|
| 1339 |
-
inputs=[
|
| 1340 |
-
|
| 1341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1342 |
outputs=[status,
|
| 1343 |
-
cluster_counts_df, domain_counts_df,
|
| 1344 |
actors_df, offhours_df,
|
| 1345 |
results_df,
|
| 1346 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
|
@@ -1371,6 +1684,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1371 |
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1372 |
tmp = tmp.drop(columns=["_dt"])
|
| 1373 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
|
|
|
|
|
|
| 1374 |
acc = [c for c in cols_out if c in tmp.columns]
|
| 1375 |
return tmp[acc].head(500)
|
| 1376 |
|
|
@@ -1416,40 +1731,75 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1416 |
except Exception:
|
| 1417 |
return None
|
| 1418 |
|
| 1419 |
-
def _vectorize_query(q: str, vec_state: Dict[str, Any]):
|
| 1420 |
-
|
| 1421 |
-
|
| 1422 |
-
|
| 1423 |
-
|
| 1424 |
-
|
| 1425 |
-
|
| 1426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1427 |
return q_full
|
| 1428 |
|
| 1429 |
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1430 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1431 |
return pd.DataFrame(), []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1432 |
q_terms = _tokenize_query(q)
|
| 1433 |
-
q_vec_full = _vectorize_query(q, vec_state)
|
| 1434 |
-
|
|
|
|
| 1435 |
q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
|
| 1436 |
if q_emb is None:
|
| 1437 |
return pd.DataFrame(), q_terms
|
| 1438 |
else:
|
| 1439 |
q_emb = q_vec_full
|
| 1440 |
-
|
| 1441 |
-
|
| 1442 |
-
|
| 1443 |
-
|
| 1444 |
-
|
| 1445 |
if isinstance(index_obj, NearestNeighbors):
|
| 1446 |
-
|
|
|
|
|
|
|
|
|
|
| 1447 |
inds = indices[0]
|
| 1448 |
sims = 1.0 - distances[0]
|
| 1449 |
results = filtered_df.iloc[inds].copy()
|
| 1450 |
results["search_score"] = sims
|
| 1451 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1452 |
-
D, I = index_obj.search(q_emb.astype(np.float32),
|
| 1453 |
inds = I[0]
|
| 1454 |
sims = D[0]
|
| 1455 |
results = filtered_df.iloc[inds].copy()
|
|
@@ -1459,8 +1809,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1459 |
|
| 1460 |
# blend with corruption score lightly
|
| 1461 |
cs = results["corruption_score"].fillna(0.0)
|
| 1462 |
-
|
| 1463 |
-
results["_blend"] = 0.7*results["search_score"].values + 0.3*
|
| 1464 |
# sort UI-selected way
|
| 1465 |
if sort_by == "search_score":
|
| 1466 |
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
|
@@ -1471,7 +1821,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1471 |
results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
|
| 1472 |
results = results.drop(columns=["_blend"])
|
| 1473 |
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
|
| 1474 |
-
return results
|
| 1475 |
|
| 1476 |
search_btn.click(
|
| 1477 |
search_fn,
|
|
@@ -1499,12 +1849,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1499 |
if dstr is not None:
|
| 1500 |
cand = cand[cand["date"] == dstr]
|
| 1501 |
if len(cand) == 0:
|
|
|
|
| 1502 |
cand = df[df["subject"] == sel.get("subject", "")]
|
| 1503 |
if len(cand) == 0:
|
| 1504 |
-
return ""
|
| 1505 |
row = cand.iloc[0]
|
| 1506 |
-
cid = int(row.get("cluster_id", -
|
| 1507 |
-
clabel = term_names.get(cid,
|
| 1508 |
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
|
| 1509 |
|
| 1510 |
results_df.select(
|
|
@@ -1523,6 +1874,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1523 |
return "(any)"
|
| 1524 |
label = df_sum.iloc[row_idx]["label"]
|
| 1525 |
return label if isinstance(label, str) else "(any)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1526 |
|
| 1527 |
cluster_counts_df.select(
|
| 1528 |
on_cluster_click,
|
|
@@ -1533,6 +1895,31 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1533 |
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1534 |
outputs=[results_df]
|
| 1535 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1536 |
|
| 1537 |
if __name__ == "__main__":
|
| 1538 |
-
demo.launch()
|
|
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
| 28 |
+
# === NEW / UPDATED IMPORTS ===
|
| 29 |
+
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer # NEW
|
| 30 |
+
from scipy.sparse import csr_matrix # NEW (to mix dense embeddings with sparse)
|
| 31 |
+
try:
|
| 32 |
+
import hdbscan # OPTIONAL (pip install hdbscan)
|
| 33 |
+
HDBSCAN_OK = True
|
| 34 |
+
except Exception:
|
| 35 |
+
HDBSCAN_OK = False
|
| 36 |
+
|
| 37 |
+
# Optional tiny/fast word vectors via Gensim (local .txt/.vec/.bin)
|
| 38 |
+
try:
|
| 39 |
+
from gensim.models import KeyedVectors # OPTIONAL
|
| 40 |
+
GENSIM_OK = True
|
| 41 |
+
except Exception:
|
| 42 |
+
GENSIM_OK = False
|
| 43 |
+
|
| 44 |
# Optional light anomaly detection
|
| 45 |
try:
|
| 46 |
from sklearn.ensemble import IsolationForest
|
|
|
|
| 587 |
)
|
| 588 |
return html
|
| 589 |
|
| 590 |
+
# ---------- Lightweight Embedding Utilities (Optional) ----------
|
| 591 |
+
def _load_embeddings(emb_path: str, binary: bool):
    """
    Load word vectors with Gensim if available.

    Accepts word2vec binary (.bin) or text formats (.txt/.vec).
    Returns (model, dim) on success, or (None, 0) when Gensim is missing,
    the path is empty/absent, or the file cannot be parsed in any format.
    """
    # Bail out early when embeddings are unusable in this environment.
    if not (GENSIM_OK and emb_path and os.path.exists(emb_path)):
        return None, 0
    # Try the declared format first, then fall back to a headerless
    # (GloVe-style) text file before giving up.
    attempts = (
        {"binary": True} if binary else {"binary": False, "no_header": False},
        {"binary": False, "no_header": True},
    )
    for kwargs in attempts:
        try:
            kv = KeyedVectors.load_word2vec_format(emb_path, **kwargs)
            return kv, int(kv.vector_size)
        except Exception:
            continue
    return None, 0
|
| 612 |
+
|
| 613 |
+
def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
| 614 |
+
"""
|
| 615 |
+
Average embeddings over tokens matched by TOKEN_PATTERN.
|
| 616 |
+
Returns zero vector if nothing matches or kv is None.
|
| 617 |
+
"""
|
| 618 |
+
vec = np.zeros((dim,), dtype=np.float32)
|
| 619 |
+
if not kv or not text:
|
| 620 |
+
return vec
|
| 621 |
+
toks = re.findall(TOKEN_PATTERN, text.lower())
|
| 622 |
+
cnt = 0
|
| 623 |
+
for t in toks:
|
| 624 |
+
if t in kv:
|
| 625 |
+
vec += kv[t]
|
| 626 |
+
cnt += 1
|
| 627 |
+
if cnt > 0:
|
| 628 |
+
vec /= float(cnt)
|
| 629 |
+
# L2-normalize
|
| 630 |
+
n = np.linalg.norm(vec)
|
| 631 |
+
if n > 0:
|
| 632 |
+
vec /= n
|
| 633 |
+
return vec
|
| 634 |
+
|
| 635 |
+
def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
|
| 636 |
+
"""
|
| 637 |
+
Build [n_docs, dim] dense matrix of averaged embeddings.
|
| 638 |
+
"""
|
| 639 |
+
if not kv or dim <= 0:
|
| 640 |
+
return np.zeros((len(texts), 0), dtype=np.float32)
|
| 641 |
+
out = np.zeros((len(texts), dim), dtype=np.float32)
|
| 642 |
+
for i, t in enumerate(texts):
|
| 643 |
+
out[i, :] = _avg_embed_for_text(t or "", kv, dim)
|
| 644 |
+
return out
|
| 645 |
+
|
| 646 |
# =================== Feature engineering (BM25 + char) ===================
|
| 647 |
class BM25Transformer:
|
| 648 |
def __init__(self, k1=1.2, b=0.75):
|
|
|
|
| 687 |
return (t + " " + " ".join(tokens)).strip()
|
| 688 |
|
| 689 |
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
|
| 690 |
+
def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20):
|
| 691 |
"""
|
| 692 |
+
Improved labeler:
|
| 693 |
+
- Considers bigrams AND trigrams (PMI vs. global)
|
| 694 |
+
- Class-TFIDF unigrams with subject coverage boost
|
| 695 |
+
- Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
|
|
|
|
|
|
|
| 696 |
"""
|
| 697 |
import math as _math
|
| 698 |
from collections import Counter, defaultdict
|
| 699 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 700 |
|
| 701 |
+
HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded",
|
| 702 |
+
"回复","主题","收件人","发件人"}
|
| 703 |
|
| 704 |
def is_junk_token(tok: str) -> bool:
|
| 705 |
if _is_junk_term(tok): return True
|
| 706 |
tl = tok.lower()
|
| 707 |
+
if tl.startswith("__"): return True
|
| 708 |
+
if tl in STOP_TERMS: return True
|
| 709 |
if tl in HEADER_STOP: return True
|
| 710 |
if "@" in tl: return True
|
|
|
|
| 711 |
if tl.isascii() and len(tl) <= 2: return True
|
| 712 |
+
if re.search(r"[^\w\-']", tl) and "’" not in tl and "'" not in tl: return True
|
|
|
|
|
|
|
|
|
|
| 713 |
return False
|
| 714 |
|
| 715 |
def tokenize_clean(t):
|
| 716 |
+
toks = re.findall(TOKEN_PATTERN, (t or "").lower())
|
| 717 |
return [w for w in toks if not is_junk_token(w)]
|
| 718 |
|
| 719 |
+
def ngrams(toks, n):
|
| 720 |
+
return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
|
| 721 |
|
| 722 |
+
# Compute global doc frequency for tokens, bigrams, trigrams
|
| 723 |
+
glob_df_uni = Counter()
|
| 724 |
+
glob_df_bg = Counter()
|
| 725 |
+
glob_df_tri = Counter()
|
| 726 |
per_c_bg = defaultdict(Counter)
|
| 727 |
+
per_c_tri = defaultdict(Counter)
|
| 728 |
per_c_texts = defaultdict(list)
|
| 729 |
per_c_doc_count = defaultdict(int)
|
|
|
|
|
|
|
| 730 |
per_c_subj_uni_docs = defaultdict(Counter)
|
| 731 |
per_c_subj_bg_docs = defaultdict(Counter)
|
| 732 |
+
per_c_subj_tri_docs = defaultdict(Counter)
|
| 733 |
|
| 734 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 735 |
|
| 736 |
+
# Pre-pass: DF stats
|
| 737 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 738 |
c = int(c)
|
| 739 |
toks = tokenize_clean(txt)
|
| 740 |
+
uni_set = set(toks)
|
| 741 |
+
bg_set = set(ngrams(toks, 2))
|
| 742 |
+
tri_set = set(ngrams(toks, 3))
|
| 743 |
+
# DF
|
| 744 |
+
glob_df_uni.update(uni_set)
|
| 745 |
+
glob_df_bg.update(bg_set)
|
| 746 |
+
glob_df_tri.update(tri_set)
|
| 747 |
+
# Per-cluster counts
|
| 748 |
+
per_c_bg[c].update(bg_set)
|
| 749 |
+
per_c_tri[c].update(tri_set)
|
| 750 |
per_c_texts[c].append(" ".join(toks))
|
| 751 |
per_c_doc_count[c] += 1
|
| 752 |
+
# Subject presence
|
| 753 |
if have_subjects:
|
| 754 |
+
stoks = tokenize_clean(subjects[idx] or "")
|
| 755 |
+
s_uni = set(stoks)
|
| 756 |
+
s_bg = set(ngrams(stoks, 2))
|
| 757 |
+
s_tri = set(ngrams(stoks, 3))
|
| 758 |
+
per_c_subj_uni_docs[c].update(s_uni)
|
| 759 |
+
per_c_subj_bg_docs[c].update(s_bg)
|
| 760 |
+
per_c_subj_tri_docs[c].update(s_tri)
|
| 761 |
+
|
| 762 |
+
N = max(1, len(texts))
|
| 763 |
labels_out = {}
|
| 764 |
+
|
| 765 |
+
# Helper: ubiquity filter
|
| 766 |
+
def too_ubiquitous(df_count): # fraction of docs
|
| 767 |
+
return (df_count / float(N)) > float(global_ubiq_cut)
|
| 768 |
|
| 769 |
for c in sorted(set(int(x) for x in labels)):
|
| 770 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 771 |
|
| 772 |
+
# PMI bigrams & trigrams with subject-coverage boost
|
| 773 |
+
phrases = []
|
| 774 |
+
for store, glob_df, subj_docs, n in (
|
| 775 |
+
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
| 776 |
+
(per_c_tri[c], glob_df_tri, per_c_subj_tri_docs[c], 3),
|
| 777 |
+
):
|
| 778 |
+
total_c = sum(store.values()) + 1e-12
|
| 779 |
+
total_g = sum(glob_df.values()) + 1e-12
|
| 780 |
+
scored = []
|
| 781 |
+
for ng, cnt in store.most_common(3000):
|
| 782 |
+
if too_ubiquitous(glob_df[ng]): # skip ubiquitous n-grams
|
| 783 |
+
continue
|
| 784 |
+
p_ng_c = cnt / total_c
|
| 785 |
+
p_ng_g = (glob_df[ng] / total_g)
|
| 786 |
+
if p_ng_c > 0 and p_ng_g > 0:
|
| 787 |
+
score = _math.log(p_ng_c) - _math.log(p_ng_g)
|
| 788 |
+
cov = 0.0
|
| 789 |
+
if have_subjects:
|
| 790 |
+
cov = subj_docs[ng] / n_docs_c
|
| 791 |
+
score += subject_alpha * cov
|
| 792 |
+
scored.append((score, ng))
|
| 793 |
+
scored.sort(reverse=True)
|
| 794 |
+
# take a couple from each class to avoid only-bigrams/only-trigrams
|
| 795 |
+
take = max(1, topn // (3 if n == 3 else 2))
|
| 796 |
+
phrases.extend([p for _, p in scored[:take]])
|
| 797 |
+
|
| 798 |
+
# Class-TFIDF unigrams with subject coverage boost
|
| 799 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 800 |
+
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 801 |
corpus = [docs_c[0], docs_bg[0]]
|
| 802 |
vec = TfidfVectorizer(
|
| 803 |
analyzer="word", ngram_range=(1,1),
|
|
|
|
| 807 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 808 |
row = X[0].toarray().ravel()
|
| 809 |
|
| 810 |
+
# Subject coverage vector
|
| 811 |
subj_cov = np.zeros_like(row)
|
| 812 |
if have_subjects:
|
| 813 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 814 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 815 |
+
if tok in vocab_index and not is_junk_token(tok):
|
| 816 |
+
subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c
|
| 817 |
+
|
| 818 |
+
row_boosted = row + subject_alpha * subj_cov
|
| 819 |
+
order = row_boosted.argsort()[::-1]
|
| 820 |
+
unis = []
|
| 821 |
+
for i in order:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
tok = vocab[i]
|
| 823 |
+
if is_junk_token(tok): continue
|
| 824 |
+
if too_ubiquitous(glob_df_uni.get(tok, 0)): # suppress ubiquitous tokens
|
| 825 |
+
continue
|
| 826 |
+
unis.append(tok)
|
| 827 |
+
if len(unis) >= max(0, topn - len(phrases)):
|
| 828 |
break
|
| 829 |
|
| 830 |
+
parts = (phrases + unis)[:max(2, topn)]
|
| 831 |
labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
|
| 832 |
|
| 833 |
return labels_out
|
|
|
|
| 988 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 989 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 990 |
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
| 991 |
+
with gr.Row():
|
| 992 |
+
use_hashing = gr.Checkbox(label="Use HashingVectorizer (memory-light, fast)", value=True) # NEW
|
| 993 |
+
hash_bits = gr.Slider(label="Hashing bits (2^n features)", minimum=16, maximum=20, step=1, value=18) # NEW
|
| 994 |
with gr.Row():
|
| 995 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 996 |
lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
|
| 997 |
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 998 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 999 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 1000 |
+
with gr.Row():
|
| 1001 |
+
use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False) # NEW
|
| 1002 |
+
hdb_min_cluster = gr.Number(label="HDBSCAN min_cluster_size", value=60, precision=0) # NEW
|
| 1003 |
+
hdb_min_samples = gr.Number(label="HDBSCAN min_samples (0=auto)", value=0, precision=0) # NEW
|
| 1004 |
+
with gr.Row():
|
| 1005 |
+
per_language = gr.Checkbox(label="Cluster per language (reduces cross-language mixing)", value=True) # NEW
|
| 1006 |
with gr.Row():
|
| 1007 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 1008 |
use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
|
|
|
|
| 1012 |
trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
|
| 1013 |
extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
|
| 1014 |
highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
|
| 1015 |
+
with gr.Row():
|
| 1016 |
+
use_embeddings = gr.Checkbox(label="Add lightweight word embeddings (avg word2vec/GloVe) if available", value=False) # NEW
|
| 1017 |
+
embed_weight = gr.Slider(label="Embedding weight in feature space", minimum=0.0, maximum=1.0, step=0.05, value=0.35) # NEW
|
| 1018 |
+
with gr.Row():
|
| 1019 |
+
embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="") # NEW
|
| 1020 |
+
embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False) # NEW
|
| 1021 |
with gr.Row():
|
| 1022 |
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 1023 |
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
|
|
|
| 1039 |
with gr.Row():
|
| 1040 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 1041 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 1042 |
+
|
| 1043 |
+
with gr.Row():
|
| 1044 |
+
sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
|
| 1045 |
|
| 1046 |
with gr.Row():
|
| 1047 |
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
|
|
|
| 1111 |
) -> pd.DataFrame:
|
| 1112 |
out = df
|
| 1113 |
if cluster and cluster != "(any)":
|
| 1114 |
+
m = re.match(r"^(\-?\d+)\s+—", cluster)
|
| 1115 |
if m:
|
| 1116 |
cid = int(m.group(1))
|
| 1117 |
out = out[out["cluster_id"] == cid]
|
|
|
|
| 1162 |
# -------- Main pipeline --------
|
| 1163 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1164 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1165 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1166 |
+
# NEW:
|
| 1167 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1168 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary):
|
| 1169 |
if inbox_file is None:
|
| 1170 |
return ("**Please upload a file.**",
|
| 1171 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1172 |
None, None, None, None)
|
| 1173 |
|
| 1174 |
+
# === Vectorization & Clustering (UPGRADED) ===
|
| 1175 |
+
def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
|
| 1176 |
+
# “texts” feed vectorizers; “subjects_only” helps labels
|
| 1177 |
+
texts = list(df_in.apply(enrich_text, axis=1))
|
| 1178 |
+
subjects_only = list(df_in["subject"].fillna(""))
|
| 1179 |
+
return texts, subjects_only
|
| 1180 |
+
|
| 1181 |
+
def _vectorize_block(
|
| 1182 |
+
texts: List[str],
|
| 1183 |
+
use_bigrams: bool,
|
| 1184 |
+
max_features: int,
|
| 1185 |
+
min_df: int,
|
| 1186 |
+
max_df: float,
|
| 1187 |
+
use_hashing: bool,
|
| 1188 |
+
hash_bits: int
|
| 1189 |
+
):
|
| 1190 |
+
"""
|
| 1191 |
+
Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
|
| 1192 |
+
Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
|
| 1193 |
+
"""
|
| 1194 |
+
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 1195 |
+
|
| 1196 |
+
if use_hashing:
|
| 1197 |
+
# HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
|
| 1198 |
+
# We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
|
| 1199 |
+
hv = HashingVectorizer(
|
| 1200 |
+
analyzer="word",
|
| 1201 |
+
ngram_range=ngram_range,
|
| 1202 |
+
n_features=2 ** int(hash_bits),
|
| 1203 |
+
alternate_sign=False, # keep positivity
|
| 1204 |
+
token_pattern=TOKEN_PATTERN,
|
| 1205 |
+
lowercase=True,
|
| 1206 |
+
norm=None # raw counts first
|
| 1207 |
+
)
|
| 1208 |
+
word_counts = hv.transform(texts) # CSR
|
| 1209 |
+
# IDF (approximate TF-IDF since we lack exact DF per token str; good enough in practice here)
|
| 1210 |
+
tfidf_tr = TfidfTransformer()
|
| 1211 |
+
X_word = tfidf_tr.fit_transform(word_counts).astype(np.float32)
|
| 1212 |
+
|
| 1213 |
+
char_vec = CharTfidf(
|
| 1214 |
+
analyzer="char", ngram_range=(3, 5),
|
| 1215 |
+
min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
|
| 1216 |
+
)
|
| 1217 |
+
X_char = char_vec.fit_transform(texts)
|
| 1218 |
+
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1219 |
+
d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
|
| 1220 |
+
# For downstream code compatibility, expose “count_vec” and “bm25” placeholders
|
| 1221 |
+
count_vec = None
|
| 1222 |
+
bm25 = None
|
| 1223 |
+
return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
|
| 1224 |
+
|
| 1225 |
+
# Original Count -> BM25 + char
|
| 1226 |
+
count_vec = CountVectorizer(
|
| 1227 |
+
analyzer="word", ngram_range=ngram_range,
|
| 1228 |
+
max_features=int(max_features) if max_features else None,
|
| 1229 |
+
min_df=int(min_df) if min_df else 2,
|
| 1230 |
+
max_df=float(max_df) if max_df else 0.7,
|
| 1231 |
+
token_pattern=TOKEN_PATTERN, lowercase=True,
|
| 1232 |
+
dtype=np.float32, stop_words=STOPWORD_FOR_VEC
|
| 1233 |
+
)
|
| 1234 |
+
TF = count_vec.fit_transform(texts)
|
| 1235 |
+
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 1236 |
+
X_word = bm25.transform(TF)
|
| 1237 |
+
|
| 1238 |
+
char_vec = CharTfidf(
|
| 1239 |
+
analyzer="char", ngram_range=(3, 5),
|
| 1240 |
+
min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
|
| 1241 |
+
)
|
| 1242 |
+
X_char = char_vec.fit_transform(texts)
|
| 1243 |
+
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1244 |
+
d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
|
| 1245 |
+
return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full
|
| 1246 |
+
|
| 1247 |
+
def _reduce_space(X_full, use_lsa, lsa_dim):
|
| 1248 |
+
svd_obj = None
|
| 1249 |
+
norm_obj = None
|
| 1250 |
+
X_reduced = None
|
| 1251 |
+
if use_lsa:
|
| 1252 |
+
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 256), random_state=0)
|
| 1253 |
+
Xtmp = svd_obj.fit_transform(X_full) # dense
|
| 1254 |
+
norm_obj = Normalizer(copy=False)
|
| 1255 |
+
X_reduced = norm_obj.fit_transform(Xtmp).astype(np.float32)
|
| 1256 |
+
del Xtmp; gc.collect()
|
| 1257 |
+
return X_reduced, svd_obj, norm_obj
|
| 1258 |
+
|
| 1259 |
+
def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
|
| 1260 |
+
"""
|
| 1261 |
+
Concatenate averaged word vectors (dense) into the current space.
|
| 1262 |
+
We convert dense embeddings to CSR to safely hstack with CSR (if using non-LSA path).
|
| 1263 |
+
"""
|
| 1264 |
+
if kv is None or emb_dim <= 0 or weight <= 0.0:
|
| 1265 |
+
return X_reduced_or_full, emb_dim
|
| 1266 |
+
doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
|
| 1267 |
+
if weight != 1.0:
|
| 1268 |
+
doc_embs *= float(weight)
|
| 1269 |
+
if isinstance(X_reduced_or_full, np.ndarray):
|
| 1270 |
+
# LSA path (dense): concat dense to dense
|
| 1271 |
+
return np.hstack([X_reduced_or_full, doc_embs]).astype(np.float32), emb_dim
|
| 1272 |
+
else:
|
| 1273 |
+
# Sparse path: convert embeddings to CSR and hstack
|
| 1274 |
+
X_emb = csr_matrix(doc_embs)
|
| 1275 |
+
return hstack([X_reduced_or_full, X_emb], format="csr"), emb_dim
|
| 1276 |
+
|
| 1277 |
+
def _cluster_space(
|
| 1278 |
+
X_space,
|
| 1279 |
+
df_part: pd.DataFrame,
|
| 1280 |
+
use_lsa: bool,
|
| 1281 |
+
use_hdbscan: bool,
|
| 1282 |
+
hdb_min_cluster: int,
|
| 1283 |
+
hdb_min_samples: int,
|
| 1284 |
+
auto_k: bool,
|
| 1285 |
+
k_clusters: int,
|
| 1286 |
+
mb_batch: int,
|
| 1287 |
+
count_vec,
|
| 1288 |
+
svd_obj,
|
| 1289 |
+
norm_obj,
|
| 1290 |
+
d_word, d_char
|
| 1291 |
+
):
|
| 1292 |
+
"""
|
| 1293 |
+
Run HDBSCAN (if requested and available) or MiniBatchKMeans.
|
| 1294 |
+
Return: labels (np.array), centers (np.ndarray) or None if HDBSCAN, and chosen_k.
|
| 1295 |
+
"""
|
| 1296 |
+
if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
|
| 1297 |
+
# HDBSCAN on LSA (dense) only
|
| 1298 |
+
min_samples = None if int(hdb_min_samples or 0) <= 0 else int(hdb_min_samples)
|
| 1299 |
+
clusterer = hdbscan.HDBSCAN(
|
| 1300 |
+
min_cluster_size=int(hdb_min_cluster or 60),
|
| 1301 |
+
min_samples=min_samples,
|
| 1302 |
+
metric='euclidean', # cosine≈euclidean on L2-normalized vectors
|
| 1303 |
+
cluster_selection_epsilon=0.0,
|
| 1304 |
+
core_dist_n_jobs=1 # CPU-friendly on HF
|
| 1305 |
+
)
|
| 1306 |
+
labels = clusterer.fit_predict(X_space)
|
| 1307 |
+
centers = None
|
| 1308 |
+
chosen_k = int(len(set([l for l in labels if l >= 0])))
|
| 1309 |
+
return labels, centers, chosen_k
|
| 1310 |
+
|
| 1311 |
+
# Otherwise MiniBatchKMeans (supports dense or sparse)
|
| 1312 |
+
if bool(auto_k):
|
| 1313 |
+
if use_lsa and isinstance(X_space, np.ndarray):
|
| 1314 |
+
k, _ = choose_k_by_kneedle(X_space, ks=(50,100,150,200,300,400,500))
|
| 1315 |
+
else:
|
| 1316 |
+
k = auto_k_rule(X_space.shape[0])
|
| 1317 |
+
else:
|
| 1318 |
+
k = max(10, int(k_clusters or 350))
|
| 1319 |
+
|
| 1320 |
+
init = None
|
| 1321 |
+
if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
|
| 1322 |
+
seeds = seeded_centroids_in_lsa(
|
| 1323 |
+
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 1324 |
+
d_word=d_word, d_full=(d_word + d_char), k=k
|
| 1325 |
+
)
|
| 1326 |
+
if seeds is not None and seeds.shape[0] == k:
|
| 1327 |
+
init = seeds
|
| 1328 |
+
|
| 1329 |
+
kmeans = MiniBatchKMeans(
|
| 1330 |
+
n_clusters=k, batch_size=int(mb_batch or 4096),
|
| 1331 |
+
random_state=0, n_init="auto" if init is None else 1,
|
| 1332 |
+
init="k-means++" if init is None else init
|
| 1333 |
+
)
|
| 1334 |
+
labels = kmeans.fit_predict(X_space)
|
| 1335 |
+
centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
|
| 1336 |
+
if use_lsa and centers is not None:
|
| 1337 |
+
labels = merge_close_clusters(labels, centers, thresh=0.95)
|
| 1338 |
+
chosen_k = int(len(set(labels)))
|
| 1339 |
+
return labels, centers, chosen_k
|
| 1340 |
+
|
| 1341 |
|
| 1342 |
# trusted org domains
|
| 1343 |
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
|
|
|
| 1348 |
recs = _load_json_records(inbox_file.name)
|
| 1349 |
if not recs:
|
| 1350 |
return ("**No valid records found.**",
|
| 1351 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1352 |
None, None, None, None)
|
| 1353 |
|
| 1354 |
# Normalize
|
| 1355 |
normd = []
|
| 1356 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 1357 |
+
out = normalize_email_record(r, use_langdetect=(not bool(skip_lang)))
|
| 1358 |
if out and out.get("body_text") is not None:
|
| 1359 |
normd.append(out)
|
| 1360 |
df = pd.DataFrame(normd)
|
| 1361 |
if df.empty:
|
| 1362 |
return ("**No usable email records after normalization.**",
|
| 1363 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 1364 |
None, None, None, None)
|
| 1365 |
|
| 1366 |
# Deduplicate conservatively
|
|
|
|
| 1393 |
df_news = df[df["is_news"]].reset_index(drop=True)
|
| 1394 |
df_alerts = df[df["is_notify"]].reset_index(drop=True)
|
| 1395 |
|
| 1396 |
+
# ----- Build texts (and optionally partition by language) -----
|
| 1397 |
+
use_lang = not bool(skip_lang)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1398 |
|
| 1399 |
+
# Optional embeddings
|
| 1400 |
+
kv = None; emb_dim = 0
|
| 1401 |
+
if bool(use_embeddings):
|
| 1402 |
+
kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))
|
| 1403 |
+
|
| 1404 |
+
# Optional per-language partitioning to avoid cross-language mixing
|
| 1405 |
+
parts = []
|
| 1406 |
+
if bool(per_language) and "lang" in df_main.columns:
|
| 1407 |
+
for lang_code, grp in df_main.groupby("lang"):
|
| 1408 |
+
if lang_code in (None, "", "unknown"):
|
| 1409 |
+
# keep unknown together
|
| 1410 |
+
parts.append(("unknown", grp.reset_index(drop=True)))
|
| 1411 |
+
else:
|
| 1412 |
+
parts.append((lang_code, grp.reset_index(drop=True)))
|
| 1413 |
else:
|
| 1414 |
+
parts = [("all", df_main.reset_index(drop=True))]
|
| 1415 |
+
|
| 1416 |
+
# Collect per-part results to concatenate
|
| 1417 |
+
labels_list = []
|
| 1418 |
+
cluster_name_list = []
|
| 1419 |
+
anomaly_list = []
|
| 1420 |
+
X_reduced_holder = None # only kept if use_lsa True and single-part; else None
|
| 1421 |
+
nn_index_obj = None # per overall (we build on all df_main at end)
|
| 1422 |
+
term_names_global = {}
|
| 1423 |
+
|
| 1424 |
+
# We'll build a global ANN/search index only if single partition & LSA on
|
| 1425 |
+
single_partition = (len(parts) == 1)
|
| 1426 |
+
d_word_agg = 0; d_char_agg=0;
|
| 1427 |
+
k_agg = 0
|
| 1428 |
+
|
| 1429 |
+
for p_lang, df_part in parts:
|
| 1430 |
+
if df_part.empty: continue
|
| 1431 |
+
texts, subjects_only = _make_texts(df_part)
|
| 1432 |
+
|
| 1433 |
+
# Vectorize
|
| 1434 |
+
X_full, count_vec, char_vec, bm25_local, d_word, d_char, d_full = _vectorize_block(
|
| 1435 |
+
texts=texts,
|
| 1436 |
+
use_bigrams=bool(use_bigrams),
|
| 1437 |
+
max_features=int(max_features or 120000),
|
| 1438 |
+
min_df=int(min_df or 3),
|
| 1439 |
+
max_df=float(max_df or 0.7),
|
| 1440 |
+
use_hashing=bool(use_hashing),
|
| 1441 |
+
hash_bits=int(hash_bits or 18)
|
| 1442 |
)
|
| 1443 |
+
d_word_agg += d_word; d_char_agg += d_char
|
| 1444 |
+
|
| 1445 |
+
# Dim reduction
|
| 1446 |
+
X_reduced, svd_obj_local, norm_obj_local = _reduce_space(X_full, bool(use_lsa), int(lsa_dim or 256))
|
| 1447 |
+
X_space = (X_reduced if X_reduced is not None else X_full)
|
| 1448 |
+
|
| 1449 |
+
# Optional embeddings concat
|
| 1450 |
+
if kv is not None and emb_dim > 0 and float(embed_weight or 0) > 0:
|
| 1451 |
+
X_space, _ = _attach_embeddings(texts, X_space, bool(use_lsa), kv, emb_dim, float(embed_weight))
|
| 1452 |
+
|
| 1453 |
+
# Optional anomaly (on LSA only, before clustering)
|
| 1454 |
+
anomaly_scores = np.full((len(df_part),), np.nan, dtype=np.float32)
|
| 1455 |
+
if X_reduced is not None and bool(use_iso) and ISO_OK and X_reduced.shape[0] >= 50:
|
| 1456 |
+
try:
|
| 1457 |
+
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 1458 |
+
iso.fit(X_reduced)
|
| 1459 |
+
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
|
| 1460 |
+
except Exception:
|
| 1461 |
+
pass
|
| 1462 |
+
|
| 1463 |
+
# Cluster (HDBSCAN or MiniBatchKMeans)
|
| 1464 |
+
labels, centers, chosen_k = _cluster_space(
|
| 1465 |
+
X_space=X_space, df_part=df_part, use_lsa=bool(use_lsa),
|
| 1466 |
+
use_hdbscan=bool(use_hdbscan), hdb_min_cluster=int(hdb_min_cluster or 60),
|
| 1467 |
+
hdb_min_samples=int(hdb_min_samples or 0),
|
| 1468 |
+
auto_k=bool(auto_k), k_clusters=int(k_clusters or 350), mb_batch=int(mb_batch or 4096),
|
| 1469 |
+
count_vec=count_vec, svd_obj=svd_obj_local, norm_obj=norm_obj_local,
|
| 1470 |
+
d_word=d_word, d_char=d_char
|
| 1471 |
+
)
|
| 1472 |
+
k_agg += chosen_k
|
| 1473 |
|
| 1474 |
+
# HDBSCAN yields -1 noise; keep as-is. KMeans may have merges above.
|
| 1475 |
+
# Labels -> cluster names
|
| 1476 |
+
term_names = cluster_labels_pmi_bigram(
|
| 1477 |
+
texts=texts, labels=labels, subjects=subjects_only, topn=6, subject_alpha=0.75, global_ubiq_cut=0.20
|
| 1478 |
+
)
|
| 1479 |
+
term_names_global.update({int(k): v for k, v in term_names.items()})
|
| 1480 |
|
| 1481 |
+
labels_list.append(pd.Series(labels, index=df_part.index))
|
| 1482 |
+
cluster_name_list.append(pd.Series([term_names.get(int(c), f"noise_{int(c)}" if c < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
|
| 1483 |
+
anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
|
|
|
|
|
|
|
| 1484 |
|
| 1485 |
+
# If single partition and LSA on, we’ll build ANN on that space after loop
|
| 1486 |
+
if single_partition and bool(use_lsa) and X_reduced is not None and X_reduced_holder is None:
|
| 1487 |
+
X_reduced_holder = X_reduced
|
|
|
|
|
|
|
| 1488 |
|
| 1489 |
+
# Stitch part results back to df_main order
|
| 1490 |
+
if not labels_list: # handle case where df_main was empty
|
| 1491 |
+
df_main["cluster_id"] = -10
|
| 1492 |
+
df_main["cluster_name"] = "unclustered"
|
| 1493 |
+
df_main["anomaly_score"] = np.nan
|
| 1494 |
+
else:
|
| 1495 |
+
df_main = df_main.copy()
|
| 1496 |
+
df_main["cluster_id"] = pd.concat(labels_list).sort_index()
|
| 1497 |
+
df_main["cluster_name"] = pd.concat(cluster_name_list).sort_index()
|
| 1498 |
+
df_main["anomaly_score"] = pd.concat(anomaly_list).sort_index()
|
| 1499 |
+
|
| 1500 |
+
# Keep your special buckets
|
| 1501 |
+
if len(df_news):
|
| 1502 |
+
df_news["cluster_id"] = -1; df_news["cluster_name"] = "newsletter/news"; df_news["anomaly_score"] = np.nan
|
| 1503 |
if len(df_alerts):
|
| 1504 |
+
df_alerts["cluster_id"] = -2; df_alerts["cluster_name"] = "system/alerts"; df_alerts["anomaly_score"] = np.nan
|
|
|
|
|
|
|
| 1505 |
|
| 1506 |
# Combine back
|
| 1507 |
df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)
|
| 1508 |
|
| 1509 |
+
# Corruption score (unchanged)
|
| 1510 |
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1511 |
|
| 1512 |
+
# --- Build search index (Faiss on LSA if single partition; else fallback to cosine brute) ---
|
| 1513 |
+
use_faiss = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
|
| 1514 |
index_obj = None
|
| 1515 |
if use_faiss:
|
| 1516 |
+
d = X_reduced_holder.shape[1]
|
| 1517 |
+
index_obj = faiss.IndexFlatIP(d)
|
| 1518 |
+
index_obj.add(X_reduced_holder)
|
| 1519 |
else:
|
| 1520 |
+
# fallback: brute cosine on feature space of df_main again (we need a space to fit)
|
| 1521 |
+
# For multi-part, just use corruption/search blending without ANN speedup (NearestNeighbors over dense LSA best-effort)
|
| 1522 |
+
try:
|
| 1523 |
+
if bool(use_lsa) and X_reduced_holder is not None and single_partition:
|
| 1524 |
+
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 1525 |
+
nn.fit(X_reduced_holder)
|
| 1526 |
+
index_obj = nn
|
| 1527 |
+
else:
|
| 1528 |
+
# If multi-part or non-LSA, do a minimal brute heuristic by re-vectorizing only for search when needed.
|
| 1529 |
+
# We'll defer exact vectors to search_fn’s projection.
|
| 1530 |
+
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 1531 |
+
# Fit on a tiny placeholder so object exists; we’ll guard in search.
|
| 1532 |
+
nn.fit(np.zeros((1, 4), dtype=np.float32))
|
| 1533 |
+
index_obj = nn
|
| 1534 |
+
except Exception:
|
| 1535 |
+
index_obj = None
|
| 1536 |
+
|
| 1537 |
# Summaries
|
| 1538 |
cluster_counts = (
|
| 1539 |
+
df[df["cluster_id"] >= 0]
|
| 1540 |
.groupby(["cluster_id", "cluster_name"]).size()
|
| 1541 |
.reset_index(name="count")
|
| 1542 |
.sort_values("count", ascending=False)
|
|
|
|
| 1546 |
# Append buckets
|
| 1547 |
news_count = int((df["cluster_id"] == -1).sum())
|
| 1548 |
alerts_count = int((df["cluster_id"] == -2).sum())
|
| 1549 |
+
hdbscan_noise = int((df["cluster_id"] < -2).sum())
|
| 1550 |
extra_rows = []
|
| 1551 |
if news_count > 0:
|
| 1552 |
extra_rows.append({"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count})
|
| 1553 |
if alerts_count > 0:
|
| 1554 |
extra_rows.append({"cluster_id": -2, "cluster_name": "system/alerts", "count": alerts_count})
|
| 1555 |
+
if hdbscan_noise > 0:
|
| 1556 |
+
extra_rows.append({"cluster_id": -3, "cluster_name": "HDBSCAN noise", "count": hdbscan_noise})
|
| 1557 |
if extra_rows:
|
| 1558 |
cluster_counts = pd.concat([cluster_counts, pd.DataFrame(extra_rows)], ignore_index=True)
|
| 1559 |
|
|
|
|
| 1578 |
)
|
| 1579 |
sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
|
| 1580 |
|
| 1581 |
+
|
| 1582 |
# Languages present
|
| 1583 |
langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
|
| 1584 |
lang_choices = ["(any)"] + langs
|
|
|
|
| 1600 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1601 |
out_table = show_df[cols_out].head(500)
|
| 1602 |
|
| 1603 |
+
# The vectorizer state is complex now with partitioning. For search, we need to rebuild it on the fly.
|
| 1604 |
+
# So we store the params to do so.
|
| 1605 |
+
vec_state = {
|
| 1606 |
+
"use_hashing": bool(use_hashing), "hash_bits": int(hash_bits),
|
| 1607 |
+
"max_features": int(max_features), "min_df": int(min_df), "max_df": float(max_df),
|
| 1608 |
+
"use_bigrams": bool(use_bigrams),
|
| 1609 |
+
# dummy objects for search fn compatibility
|
| 1610 |
+
"count_vec": None, "char_vec": None, "bm25": None
|
| 1611 |
+
}
|
| 1612 |
|
| 1613 |
status_md = (
|
| 1614 |
f"**Processed {len(df):,} emails** \n"
|
| 1615 |
+
f"Word feats: {d_word_agg:,} | Char feats: {d_char_agg:,} (x0.20) \n"
|
| 1616 |
+
f"{'LSA: ' + str(X_reduced_holder.shape[1]) + ' dims | ' if X_reduced_holder is not None else ''}"
|
| 1617 |
+
f"k = {k_agg} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1618 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
| 1619 |
)
|
| 1620 |
|
|
|
|
| 1624 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1625 |
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1626 |
lang_update = gr.update(choices=lang_choices, value="(any)")
|
| 1627 |
+
|
| 1628 |
+
# NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
|
| 1629 |
+
svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
|
| 1630 |
+
norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
|
| 1631 |
|
| 1632 |
return (
|
| 1633 |
status_md,
|
| 1634 |
+
cluster_counts, domain_counts, sender_counts,
|
| 1635 |
actors, offhours_table,
|
| 1636 |
out_table,
|
| 1637 |
+
df, vec_state, (X_reduced_holder if bool(use_lsa) else None), index_obj, term_names_global,
|
| 1638 |
+
bool(use_lsa), bool(use_faiss),
|
| 1639 |
cluster_update, domain_update, sender_update, lang_update,
|
| 1640 |
+
svd_obj_out, norm_obj_out,
|
| 1641 |
+
(d_word_agg, d_char_agg),
|
| 1642 |
extra_terms_lower, bool(highlight_toggle)
|
| 1643 |
)
|
| 1644 |
|
| 1645 |
(run_btn.click)(
|
| 1646 |
process_file,
|
| 1647 |
+
inputs=[
|
| 1648 |
+
inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1649 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1650 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 1651 |
+
# NEW:
|
| 1652 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 1653 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary
|
| 1654 |
+
],
|
| 1655 |
outputs=[status,
|
| 1656 |
+
cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 1657 |
actors_df, offhours_df,
|
| 1658 |
results_df,
|
| 1659 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
|
|
|
| 1684 |
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1685 |
tmp = tmp.drop(columns=["_dt"])
|
| 1686 |
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1687 |
+
if "search_score" in tmp.columns:
|
| 1688 |
+
cols_out.append("search_score")
|
| 1689 |
acc = [c for c in cols_out if c in tmp.columns]
|
| 1690 |
return tmp[acc].head(500)
|
| 1691 |
|
|
|
|
| 1731 |
except Exception:
|
| 1732 |
return None
|
| 1733 |
|
| 1734 |
+
def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
|
| 1735 |
+
# This has to re-fit a vectorizer to project the query, since partitioning means
|
| 1736 |
+
# we don't have a single global one.
|
| 1737 |
+
if vec_state.get("use_hashing"):
|
| 1738 |
+
hv = HashingVectorizer(
|
| 1739 |
+
analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
|
| 1740 |
+
n_features=2**vec_state.get('hash_bits', 18), alternate_sign=False,
|
| 1741 |
+
token_pattern=TOKEN_PATTERN, lowercase=True, norm=None
|
| 1742 |
+
)
|
| 1743 |
+
# fit to get the transformer state on the corpus
|
| 1744 |
+
word_counts = hv.fit_transform(corpus_texts_for_fit)
|
| 1745 |
+
tfidf_tr = TfidfTransformer().fit(word_counts)
|
| 1746 |
+
q_word_counts = hv.transform([q])
|
| 1747 |
+
q_word = tfidf_tr.transform(q_word_counts)
|
| 1748 |
+
else: # BM25 path
|
| 1749 |
+
count_vec = CountVectorizer(
|
| 1750 |
+
analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
|
| 1751 |
+
max_features=vec_state.get('max_features'), min_df=vec_state.get('min_df'),
|
| 1752 |
+
max_df=vec_state.get('max_df'), token_pattern=TOKEN_PATTERN, lowercase=True,
|
| 1753 |
+
dtype=np.float32, stop_words=STOPWORD_FOR_VEC
|
| 1754 |
+
)
|
| 1755 |
+
TF = count_vec.fit_transform(corpus_texts_for_fit)
|
| 1756 |
+
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 1757 |
+
q_word_tf = count_vec.transform([q])
|
| 1758 |
+
q_word = bm25.transform(q_word_tf)
|
| 1759 |
+
|
| 1760 |
+
char_vec = CharTfidf(
|
| 1761 |
+
analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
|
| 1762 |
+
lowercase=True, dtype=np.float32
|
| 1763 |
+
).fit(corpus_texts_for_fit)
|
| 1764 |
+
|
| 1765 |
+
q_char = char_vec.transform([q])
|
| 1766 |
+
q_full = hstack([q_word, q_char * 0.20], format="csr")
|
| 1767 |
return q_full
|
| 1768 |
|
| 1769 |
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1770 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1771 |
return pd.DataFrame(), []
|
| 1772 |
+
|
| 1773 |
+
# align with df_main order (exclude -1 and -2)
|
| 1774 |
+
mask = ~df["cluster_id"].isin([-1, -2, -3])
|
| 1775 |
+
filtered_df = df[mask].reset_index(drop=True)
|
| 1776 |
+
if filtered_df.empty:
|
| 1777 |
+
return pd.DataFrame(), []
|
| 1778 |
+
|
| 1779 |
q_terms = _tokenize_query(q)
|
| 1780 |
+
q_vec_full = _vectorize_query(q, vec_state, corpus_texts_for_fit=list(filtered_df.apply(enrich_text, axis=1)))
|
| 1781 |
+
|
| 1782 |
+
if use_lsa_flag and (X_reduced is not None) and (svd_obj is not None) and (norm_obj is not None):
|
| 1783 |
q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
|
| 1784 |
if q_emb is None:
|
| 1785 |
return pd.DataFrame(), q_terms
|
| 1786 |
else:
|
| 1787 |
q_emb = q_vec_full
|
| 1788 |
+
|
| 1789 |
+
n_neighbors = min(50, len(filtered_df))
|
| 1790 |
+
if n_neighbors <= 0: return pd.DataFrame(), q_terms
|
| 1791 |
+
|
|
|
|
| 1792 |
if isinstance(index_obj, NearestNeighbors):
|
| 1793 |
+
# Check if index was fit on placeholder
|
| 1794 |
+
if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
|
| 1795 |
+
return pd.DataFrame(), q_terms # cannot search
|
| 1796 |
+
distances, indices = index_obj.kneighbors(q_emb, n_neighbors=n_neighbors)
|
| 1797 |
inds = indices[0]
|
| 1798 |
sims = 1.0 - distances[0]
|
| 1799 |
results = filtered_df.iloc[inds].copy()
|
| 1800 |
results["search_score"] = sims
|
| 1801 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1802 |
+
D, I = index_obj.search(q_emb.astype(np.float32), k=n_neighbors)
|
| 1803 |
inds = I[0]
|
| 1804 |
sims = D[0]
|
| 1805 |
results = filtered_df.iloc[inds].copy()
|
|
|
|
| 1809 |
|
| 1810 |
# blend with corruption score lightly
|
| 1811 |
cs = results["corruption_score"].fillna(0.0)
|
| 1812 |
+
cs_norm = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9) if (cs.max() > cs.min()) else cs
|
| 1813 |
+
results["_blend"] = 0.7*results["search_score"].values + 0.3*cs_norm.values
|
| 1814 |
# sort UI-selected way
|
| 1815 |
if sort_by == "search_score":
|
| 1816 |
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
|
|
|
| 1821 |
results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
|
| 1822 |
results = results.drop(columns=["_blend"])
|
| 1823 |
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
|
| 1824 |
+
return _sort_results(results.head(50), sort_by, sort_dir), q_terms
|
| 1825 |
|
| 1826 |
search_btn.click(
|
| 1827 |
search_fn,
|
|
|
|
| 1849 |
if dstr is not None:
|
| 1850 |
cand = cand[cand["date"] == dstr]
|
| 1851 |
if len(cand) == 0:
|
| 1852 |
+
# Fallback to subject only if exact match fails
|
| 1853 |
cand = df[df["subject"] == sel.get("subject", "")]
|
| 1854 |
if len(cand) == 0:
|
| 1855 |
+
return f"Could not find original record for: {subj}"
|
| 1856 |
row = cand.iloc[0]
|
| 1857 |
+
cid = int(row.get("cluster_id", -99))
|
| 1858 |
+
clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
|
| 1859 |
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
|
| 1860 |
|
| 1861 |
results_df.select(
|
|
|
|
| 1874 |
return "(any)"
|
| 1875 |
label = df_sum.iloc[row_idx]["label"]
|
| 1876 |
return label if isinstance(label, str) else "(any)"
|
| 1877 |
+
|
| 1878 |
+
# Click domain summary to filter
def on_domain_click(evt: "gr.SelectData", df_sum: "pd.DataFrame"):
    """Resolve a click on the domain summary table into a dropdown value.

    Returns the clicked row's ``from_domain``, or the wildcard "(any)"
    whenever the selection cannot be resolved (no table, bad or stale
    index, empty value).
    """
    try:
        # gradio may report the selection as a scalar or a (row, col) pair.
        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        row_idx = evt.index if hasattr(evt, "index") else None
    if row_idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    try:
        # Guard against stale/out-of-range indices so the callback never
        # crashes the UI; fall back to the wildcard instead.
        val = df_sum.iloc[row_idx]["from_domain"]
    except Exception:
        return "(any)"
    return val if isinstance(val, str) and val else "(any)"
|
| 1888 |
|
| 1889 |
cluster_counts_df.select(
|
| 1890 |
on_cluster_click,
|
|
|
|
| 1895 |
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1896 |
outputs=[results_df]
|
| 1897 |
)
|
| 1898 |
+
|
| 1899 |
+
# Clicking a row in the domain summary sets the domain filter dropdown,
# then re-runs the filtered results query with the current filter state.
domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
    .then(
        refresh_results,
        inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
        outputs=[results_df]
    )
|
| 1905 |
+
|
| 1906 |
+
def on_sender_click(evt: "gr.SelectData", df_sum: "pd.DataFrame"):
    """Resolve a click on the sender summary table into a dropdown value.

    Returns the clicked row's ``from_email``, or the wildcard "(any)"
    whenever the selection cannot be resolved (no table, bad or stale
    index, empty value).
    """
    try:
        # gradio may report the selection as a scalar or a (row, col) pair.
        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        row_idx = evt.index if hasattr(evt, "index") else None
    if row_idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    try:
        # Guard against stale/out-of-range indices so the callback never
        # crashes the UI; fall back to the wildcard instead.
        val = df_sum.iloc[row_idx]["from_email"]
    except Exception:
        return "(any)"
    return val if isinstance(val, str) and val else "(any)"
|
| 1915 |
+
|
| 1916 |
+
# Clicking a row in the sender summary sets the sender filter dropdown,
# then re-runs the filtered results query with the current filter state.
sender_counts_df.select(on_sender_click, inputs=[sender_counts_df], outputs=[sender_drop]) \
    .then(
        refresh_results,
        inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
        outputs=[results_df]
    )
|
| 1922 |
+
|
| 1923 |
|
| 1924 |
# Script entry point: launch the Gradio app when run directly.
if __name__ == "__main__":
    demo.launch()
|