Update app.py
Browse files
app.py
CHANGED
|
@@ -117,9 +117,14 @@ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
|
| 117 |
# -------- System/notification heuristics (bucket as cluster -2) --------
|
| 118 |
NOTIFY_PATTERNS = [
|
| 119 |
r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
|
| 120 |
-
r"verification code", r"two[-\s]?factor", r"\botp\b", r"\bcode[:\s]",
|
| 121 |
r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
|
| 122 |
-
r"unable to determine", r"reset your password", r"\balert\b"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
]
|
| 124 |
NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
|
| 125 |
def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
|
|
@@ -169,24 +174,49 @@ MONTHS = {
|
|
| 169 |
"january","february","march","april","june","july","august","september",
|
| 170 |
"october","november","december"
|
| 171 |
}
|
| 172 |
-
|
|
|
|
| 173 |
STOP_TERMS = {
|
| 174 |
"div","span","nbsp","href","src","img","class","style","align","border","cid",
|
| 175 |
"content","content-type","multipart","alternative","quoted","printable","utf",
|
| 176 |
"windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
|
| 177 |
-
|
| 178 |
-
"type","id","service","person","generated"
|
| 179 |
}
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
|
| 182 |
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
|
| 183 |
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
|
| 184 |
ONE_CHAR_RE = re.compile(r"^.$")
|
| 185 |
|
|
|
|
|
|
|
|
|
|
| 186 |
def _is_junk_term(t: str) -> bool:
|
| 187 |
tl = t.lower()
|
| 188 |
-
if tl in STOP_TERMS
|
| 189 |
-
if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
|
| 190 |
return True
|
| 191 |
if EMAIL_LIKE_RE.search(tl): return True
|
| 192 |
if YEAR_RE.match(tl): return True
|
|
@@ -241,6 +271,9 @@ def strip_quotes_and_sigs(text: str) -> str:
|
|
| 241 |
cut = idx if (cut is None or idx < cut) else cut
|
| 242 |
if cut is not None:
|
| 243 |
text = text[:cut]
|
|
|
|
|
|
|
|
|
|
| 244 |
return text.strip()
|
| 245 |
|
| 246 |
def parse_name_email(s: str) -> Tuple[str, str]:
|
|
@@ -581,8 +614,16 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 581 |
tokens.append(lang_tok)
|
| 582 |
return (t + " " + " ".join(tokens)).strip()
|
| 583 |
|
| 584 |
-
# =================== Cluster labeling:
|
| 585 |
-
def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
import math as _math
|
| 587 |
from collections import Counter, defaultdict
|
| 588 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
@@ -596,7 +637,9 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
| 596 |
if tl in STOP_TERMS: return True # extra HTML/MIME junk
|
| 597 |
if tl in HEADER_STOP: return True
|
| 598 |
if "@" in tl: return True
|
| 599 |
-
# drop
|
|
|
|
|
|
|
| 600 |
if re.search(r"[^\w\-']", tl):
|
| 601 |
if "’" not in tl and "'" not in tl:
|
| 602 |
return True
|
|
@@ -612,19 +655,37 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
| 612 |
glob_bg = Counter()
|
| 613 |
per_c_bg = defaultdict(Counter)
|
| 614 |
per_c_texts = defaultdict(list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
|
| 616 |
-
for txt, c in zip(texts, labels):
|
|
|
|
| 617 |
toks = tokenize_clean(txt)
|
| 618 |
bgs = set(bigrams(toks))
|
| 619 |
glob_bg.update(bgs)
|
| 620 |
-
per_c_bg[
|
| 621 |
-
per_c_texts[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
|
| 623 |
labels_out = {}
|
| 624 |
total_bg = sum(glob_bg.values()) + 1e-12
|
| 625 |
|
| 626 |
for c in sorted(set(int(x) for x in labels)):
|
| 627 |
-
|
|
|
|
|
|
|
| 628 |
scores = []
|
| 629 |
total_c = sum(per_c_bg[c].values()) + 1e-12
|
| 630 |
for bg, cnt in per_c_bg[c].most_common(2000):
|
|
@@ -632,11 +693,16 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
| 632 |
p_bg = (glob_bg[bg] / total_bg)
|
| 633 |
if p_bg > 0 and p_bg_c > 0:
|
| 634 |
score = _math.log(p_bg_c) - _math.log(p_bg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
scores.append((score, bg))
|
| 636 |
scores.sort(reverse=True)
|
| 637 |
top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
|
| 638 |
|
| 639 |
-
# class-TFIDF unigrams (cluster doc vs. background doc)
|
| 640 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 641 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
|
| 642 |
corpus = [docs_c[0], docs_bg[0]]
|
|
@@ -647,7 +713,22 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
| 647 |
X = vec.fit_transform(corpus)
|
| 648 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 649 |
row = X[0].toarray().ravel()
|
| 650 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
top_uni = []
|
| 652 |
for i in top_idx:
|
| 653 |
tok = vocab[i]
|
|
@@ -696,7 +777,7 @@ def merge_close_clusters(labels, centers, thresh=0.92):
|
|
| 696 |
while parent[a]!=a: a=parent[a]
|
| 697 |
return a
|
| 698 |
for i in range(k):
|
| 699 |
-
for j in range(i+1, k):
|
| 700 |
if sim[i,j] >= thresh:
|
| 701 |
pi, pj = find(i), find(j)
|
| 702 |
if pi!=pj: parent[pj]=pi
|
|
@@ -1037,6 +1118,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1037 |
|
| 1038 |
# Enriched texts (adds __HAS_*__ flags + __LANG__)
|
| 1039 |
texts = list(df_main.apply(enrich_text, axis=1))
|
|
|
|
| 1040 |
|
| 1041 |
# === Vectorization ===
|
| 1042 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
|
@@ -1049,6 +1131,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1049 |
token_pattern=TOKEN_PATTERN,
|
| 1050 |
lowercase=True,
|
| 1051 |
dtype=np.float32,
|
|
|
|
| 1052 |
)
|
| 1053 |
TF = count_vec.fit_transform(texts)
|
| 1054 |
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
|
@@ -1060,8 +1143,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1060 |
)
|
| 1061 |
X_char = char_vec.fit_transform(texts)
|
| 1062 |
|
| 1063 |
-
# Down-weight char-grams so they don't dominate geometry
|
| 1064 |
-
X_full = hstack([X_word, X_char * 0.
|
| 1065 |
d_word = X_word.shape[1]
|
| 1066 |
d_char = X_char.shape[1]
|
| 1067 |
d_full = X_full.shape[1]
|
|
@@ -1125,7 +1208,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1125 |
|
| 1126 |
# Attach clustering back to df_main
|
| 1127 |
df_main["cluster_id"] = labels
|
| 1128 |
-
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 1129 |
df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1130 |
df_main["anomaly_score"] = anomaly_scores
|
| 1131 |
|
|
@@ -1225,7 +1308,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1225 |
|
| 1226 |
status_md = (
|
| 1227 |
f"**Processed {len(df):,} emails** \n"
|
| 1228 |
-
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.
|
| 1229 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 1230 |
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1231 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
|
@@ -1355,7 +1438,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1355 |
else:
|
| 1356 |
q_emb = q_vec_full
|
| 1357 |
|
| 1358 |
-
#
|
| 1359 |
mask = ~df["cluster_id"].isin([-1, -2])
|
| 1360 |
filtered_df = df[mask]
|
| 1361 |
|
|
|
|
| 117 |
# -------- System/notification heuristics (bucket as cluster -2) --------
|
| 118 |
NOTIFY_PATTERNS = [
|
| 119 |
r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
|
| 120 |
+
r"verification code", r"two[-\s]?factor|\b2fa\b", r"\botp\b", r"\bcode[:\s]",
|
| 121 |
r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
|
| 122 |
+
r"unable to determine", r"reset your password", r"\balert\b",
|
| 123 |
+
# bounces / gateways / quarantine
|
| 124 |
+
r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
|
| 125 |
+
r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
|
| 126 |
+
r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
|
| 127 |
+
r"spam digest", r"phishing", r"security gateway", r"mail[-\s]?secure|secure message"
|
| 128 |
]
|
| 129 |
NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
|
| 130 |
def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
|
|
|
|
| 174 |
"january","february","march","april","june","july","august","september",
|
| 175 |
"october","november","december"
|
| 176 |
}
|
| 177 |
+
|
| 178 |
+
# Extra junk/HTML/MIME terms to suppress in labels (expanded)
|
| 179 |
STOP_TERMS = {
|
| 180 |
"div","span","nbsp","href","src","img","class","style","align","border","cid",
|
| 181 |
"content","content-type","multipart","alternative","quoted","printable","utf",
|
| 182 |
"windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
|
| 183 |
+
"type","id","service","person","generated","fyi"
|
|
|
|
| 184 |
}
|
| 185 |
|
| 186 |
+
# NEW: broader stop buckets for labels *and* features
|
| 187 |
+
AUX_STOP = {
|
| 188 |
+
"will","would","should","could","can","cant","cannot","did","do","does","done",
|
| 189 |
+
"have","has","had","having","get","got","make","made","let","need","want",
|
| 190 |
+
"not","dont","didnt","isnt","arent","wasnt","werent","im","youre","hes","shes",
|
| 191 |
+
"weve","ive","theyre","its","ok","okay","pls","please","thx","thanks","regards","best",
|
| 192 |
+
"hi","hello","dear","re","fw","fwd","via","kind"
|
| 193 |
+
}
|
| 194 |
+
CTA_STOP = {
|
| 195 |
+
"click","here","unsubscribe","view","browser","mailto","reply","iphone","android",
|
| 196 |
+
"press","link","below","above","update","newsletter","manage","preferences",
|
| 197 |
+
"לחץ","כאן","נשלח","מה","מה-iphone","הטלפון"
|
| 198 |
+
}
|
| 199 |
+
TECH_META = {
|
| 200 |
+
"quot","nbsp","cid","href","src","img","class","style","div","span","http","https",
|
| 201 |
+
"content","content-type","multipart","alternative","quoted","printable","utf",
|
| 202 |
+
"windows-1255","iso-8859","us-ascii","attachment","filename"
|
| 203 |
+
}
|
| 204 |
+
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
|
| 205 |
+
HE_EXTRA_STOP = {"עם","או"}
|
| 206 |
+
|
| 207 |
+
# fold into STOP_TERMS and build a vectorizer stoplist
|
| 208 |
+
STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
|
| 209 |
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
|
| 210 |
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
|
| 211 |
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
|
| 212 |
ONE_CHAR_RE = re.compile(r"^.$")
|
| 213 |
|
| 214 |
+
# This stoplist is used by the CountVectorizer
|
| 215 |
+
# scikit-learn documents `stop_words` as 'english', a list, or None, so the
# set union is materialised as a list; sorting keeps the order deterministic.
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 216 |
+
|
| 217 |
def _is_junk_term(t: str) -> bool:
|
| 218 |
tl = t.lower()
|
| 219 |
+
if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
|
|
|
|
| 220 |
return True
|
| 221 |
if EMAIL_LIKE_RE.search(tl): return True
|
| 222 |
if YEAR_RE.match(tl): return True
|
|
|
|
| 271 |
cut = idx if (cut is None or idx < cut) else cut
|
| 272 |
if cut is not None:
|
| 273 |
text = text[:cut]
|
| 274 |
+
# extra safety for mobile signatures that sneak through
|
| 275 |
+
text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
|
| 276 |
+
text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
|
| 277 |
return text.strip()
|
| 278 |
|
| 279 |
def parse_name_email(s: str) -> Tuple[str, str]:
|
|
|
|
| 614 |
tokens.append(lang_tok)
|
| 615 |
return (t + " " + " ".join(tokens)).strip()
|
| 616 |
|
| 617 |
+
# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST ===================
|
| 618 |
+
def cluster_labels_pmi_bigram(texts, labels, subjects=None, topn=6, subject_alpha=0.75):
|
| 619 |
+
"""
|
| 620 |
+
Create human-readable labels per cluster using:
|
| 621 |
+
1) PMI bigrams (cluster vs global) + subject coverage boost
|
| 622 |
+
2) Class-TFIDF unigrams (cluster vs rest) + subject coverage boost
|
| 623 |
+
|
| 624 |
+
`subjects`: list of subject strings aligned with `texts`
|
| 625 |
+
`subject_alpha`: weight added per token = alpha * coverage_in_subjects (0..1)
|
| 626 |
+
"""
|
| 627 |
import math as _math
|
| 628 |
from collections import Counter, defaultdict
|
| 629 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
| 637 |
if tl in STOP_TERMS: return True # extra HTML/MIME junk
|
| 638 |
if tl in HEADER_STOP: return True
|
| 639 |
if "@" in tl: return True
|
| 640 |
+
# drop short ASCII like "eb/ys/yl"
|
| 641 |
+
if tl.isascii() and len(tl) <= 2: return True
|
| 642 |
+
# punctuation blobs (keep apostrophes)
|
| 643 |
if re.search(r"[^\w\-']", tl):
|
| 644 |
if "’" not in tl and "'" not in tl:
|
| 645 |
return True
|
|
|
|
| 655 |
glob_bg = Counter()
|
| 656 |
per_c_bg = defaultdict(Counter)
|
| 657 |
per_c_texts = defaultdict(list)
|
| 658 |
+
per_c_doc_count = defaultdict(int)
|
| 659 |
+
|
| 660 |
+
# SUBJECT presence (unique tokens/bigrams per subject per doc)
|
| 661 |
+
per_c_subj_uni_docs = defaultdict(Counter)
|
| 662 |
+
per_c_subj_bg_docs = defaultdict(Counter)
|
| 663 |
+
|
| 664 |
+
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 665 |
|
| 666 |
+
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 667 |
+
c = int(c)
|
| 668 |
toks = tokenize_clean(txt)
|
| 669 |
bgs = set(bigrams(toks))
|
| 670 |
glob_bg.update(bgs)
|
| 671 |
+
per_c_bg[c].update(bgs)
|
| 672 |
+
per_c_texts[c].append(" ".join(toks))
|
| 673 |
+
per_c_doc_count[c] += 1
|
| 674 |
+
|
| 675 |
+
if have_subjects:
|
| 676 |
+
subj_toks = tokenize_clean(subjects[idx] or "")
|
| 677 |
+
subj_uni_set = set(subj_toks)
|
| 678 |
+
subj_bg_set = set(bigrams(subj_toks))
|
| 679 |
+
per_c_subj_uni_docs[c].update(subj_uni_set)
|
| 680 |
+
per_c_subj_bg_docs[c].update(subj_bg_set)
|
| 681 |
|
| 682 |
labels_out = {}
|
| 683 |
total_bg = sum(glob_bg.values()) + 1e-12
|
| 684 |
|
| 685 |
for c in sorted(set(int(x) for x in labels)):
|
| 686 |
+
n_docs_c = max(1, per_c_doc_count[c])
|
| 687 |
+
|
| 688 |
+
# PMI bigrams (+ subject boost)
|
| 689 |
scores = []
|
| 690 |
total_c = sum(per_c_bg[c].values()) + 1e-12
|
| 691 |
for bg, cnt in per_c_bg[c].most_common(2000):
|
|
|
|
| 693 |
p_bg = (glob_bg[bg] / total_bg)
|
| 694 |
if p_bg > 0 and p_bg_c > 0:
|
| 695 |
score = _math.log(p_bg_c) - _math.log(p_bg)
|
| 696 |
+
# subject coverage boost: fraction of cluster docs whose SUBJECT contains this bigram
|
| 697 |
+
cov = 0.0
|
| 698 |
+
if have_subjects:
|
| 699 |
+
cov = per_c_subj_bg_docs[c][bg] / n_docs_c
|
| 700 |
+
score = score + subject_alpha * cov
|
| 701 |
scores.append((score, bg))
|
| 702 |
scores.sort(reverse=True)
|
| 703 |
top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
|
| 704 |
|
| 705 |
+
# class-TFIDF unigrams (cluster doc vs. background doc) + subject boost
|
| 706 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 707 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
|
| 708 |
corpus = [docs_c[0], docs_bg[0]]
|
|
|
|
| 713 |
X = vec.fit_transform(corpus)
|
| 714 |
vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
|
| 715 |
row = X[0].toarray().ravel()
|
| 716 |
+
|
| 717 |
+
# Build subject coverage vector over this vocab
|
| 718 |
+
subj_cov = np.zeros_like(row)
|
| 719 |
+
if have_subjects:
|
| 720 |
+
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 721 |
+
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 722 |
+
if tok in vocab_index:
|
| 723 |
+
subj_cov[vocab_index[tok]] = cnt_docs / n_docs_c # 0..1
|
| 724 |
+
|
| 725 |
+
# Apply boost (only to non-junk tokens)
|
| 726 |
+
row_boosted = row.copy()
|
| 727 |
+
for i, tok in enumerate(vocab):
|
| 728 |
+
if subj_cov[i] > 0 and not is_junk_token(tok):
|
| 729 |
+
row_boosted[i] = row[i] + subject_alpha * float(subj_cov[i])
|
| 730 |
+
|
| 731 |
+
top_idx = row_boosted.argsort()[::-1][: max(0, topn - len(top_bi)) ]
|
| 732 |
top_uni = []
|
| 733 |
for i in top_idx:
|
| 734 |
tok = vocab[i]
|
|
|
|
| 777 |
while parent[a]!=a: a=parent[a]
|
| 778 |
return a
|
| 779 |
for i in range(k):
|
| 780 |
+
for j in range(i+1, j := k):
|
| 781 |
if sim[i,j] >= thresh:
|
| 782 |
pi, pj = find(i), find(j)
|
| 783 |
if pi!=pj: parent[pj]=pi
|
|
|
|
| 1118 |
|
| 1119 |
# Enriched texts (adds __HAS_*__ flags + __LANG__)
|
| 1120 |
texts = list(df_main.apply(enrich_text, axis=1))
|
| 1121 |
+
subjects_only = list(df_main["subject"].fillna(""))
|
| 1122 |
|
| 1123 |
# === Vectorization ===
|
| 1124 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
|
|
|
| 1131 |
token_pattern=TOKEN_PATTERN,
|
| 1132 |
lowercase=True,
|
| 1133 |
dtype=np.float32,
|
| 1134 |
+
stop_words=STOPWORD_FOR_VEC, # <-- use expanded stoplist
|
| 1135 |
)
|
| 1136 |
TF = count_vec.fit_transform(texts)
|
| 1137 |
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
|
|
|
| 1143 |
)
|
| 1144 |
X_char = char_vec.fit_transform(texts)
|
| 1145 |
|
| 1146 |
+
# Down-weight char-grams so they don't dominate geometry (slightly lower)
|
| 1147 |
+
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
| 1148 |
d_word = X_word.shape[1]
|
| 1149 |
d_char = X_char.shape[1]
|
| 1150 |
d_full = X_full.shape[1]
|
|
|
|
| 1208 |
|
| 1209 |
# Attach clustering back to df_main
|
| 1210 |
df_main["cluster_id"] = labels
|
| 1211 |
+
term_names = cluster_labels_pmi_bigram(texts, labels, subjects=subjects_only, topn=6, subject_alpha=0.75)
|
| 1212 |
df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1213 |
df_main["anomaly_score"] = anomaly_scores
|
| 1214 |
|
|
|
|
| 1308 |
|
| 1309 |
status_md = (
|
| 1310 |
f"**Processed {len(df):,} emails** \n"
|
| 1311 |
+
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.20) | Total: {d_full:,} \n"
|
| 1312 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 1313 |
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1314 |
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
|
|
|
| 1438 |
else:
|
| 1439 |
q_emb = q_vec_full
|
| 1440 |
|
| 1441 |
+
# align with df_main order (exclude -1 and -2)
|
| 1442 |
mask = ~df["cluster_id"].isin([-1, -2])
|
| 1443 |
filtered_df = df[mask]
|
| 1444 |
|