Update app.py
Browse files
app.py
CHANGED
|
@@ -72,7 +72,7 @@ TAXONOMY = {
|
|
| 72 |
"HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
|
| 73 |
"Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
|
| 74 |
"Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
|
| 75 |
-
"Legal": ["legal","lawsuit","attorney","counsel","privileged","court","subpoena","confidential"],
|
| 76 |
"IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
|
| 77 |
"Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
|
| 78 |
"Other": [],
|
|
@@ -94,38 +94,31 @@ def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
|
|
| 94 |
if bucket == "IT/Security":
|
| 95 |
return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
|
| 96 |
if bucket == "Constituent":
|
| 97 |
-
# personal mail to public office is a strong hint
|
| 98 |
return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
|
| 99 |
if bucket == "Lobbyist":
|
| 100 |
return 5.0 if fd in LOBBY_DOMAINS else 0.0
|
| 101 |
if bucket == "Legal":
|
| 102 |
return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
|
| 103 |
if bucket == "Scheduling":
|
| 104 |
-
# ICS invite or explicit invite subject
|
| 105 |
body = (row.get("body_text") or "")
|
| 106 |
return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
|
| 107 |
return 0.0
|
| 108 |
|
| 109 |
-
MIN_ROUTE_SCORE = 1.5
|
| 110 |
TIE_MARGIN = 1.0
|
| 111 |
|
| 112 |
def route_email_row(row: pd.Series) -> str:
|
| 113 |
text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 114 |
scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
|
| 115 |
-
# lexicon points
|
| 116 |
for b, terms in TAXONOMY.items():
|
| 117 |
-
if not terms:
|
| 118 |
continue
|
| 119 |
-
# count unique term hits to avoid over-crediting repeats
|
| 120 |
hits = sum(1 for t in terms if t and t.lower() in text)
|
| 121 |
scores[b] += float(hits)
|
| 122 |
-
# strong phrases in your corruption lexicon can hint Lobbyist/Procurement
|
| 123 |
if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
|
| 124 |
scores[b] += 1.0
|
| 125 |
-
# header bonuses
|
| 126 |
for b in TAXONOMY.keys():
|
| 127 |
scores[b] += _bucket_header_bonus(row, b)
|
| 128 |
-
# choose
|
| 129 |
best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
|
| 130 |
second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
|
| 131 |
if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
|
|
@@ -147,35 +140,28 @@ SKIP_LANGDETECT = True
|
|
| 147 |
|
| 148 |
# ==== Expanded corruption lexicon ====
|
| 149 |
SUSPECT_PHRASES = [
|
| 150 |
-
# core corruption/finance
|
| 151 |
"off the books","cover up","kickback","bribe","under the table",
|
| 152 |
"no inspection","special fee","friendly payment","confidential deal",
|
| 153 |
"nobody will find out","pay to play","cash only","shell company",
|
| 154 |
"bid rigging","embezzle","slush fund","false invoice","ghost employee",
|
| 155 |
"contract splitting","grease payment","unreported","unrecorded",
|
| 156 |
-
# secrecy/evasion
|
| 157 |
"off the record","just between us","don’t quote me on this","dont quote me on this",
|
| 158 |
"we never had this conversation","keep this between us","not ethical","illegal",
|
| 159 |
"grey area","gray area","write off","failed investment","they owe it to me",
|
| 160 |
-
# off-channel comms
|
| 161 |
"let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
|
| 162 |
"don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
|
| 163 |
"tell you on the phone","talk in person","come by my office","vpn",
|
| 164 |
-
# financial secrecy & accounting games
|
| 165 |
"tax haven","off-shore account","offshore account","backdate","pull earnings forward",
|
| 166 |
"delete this email","no inspection","special fees","wire instructions",
|
| 167 |
]
|
| 168 |
-
# Evasive acronyms / slang (case-insensitive)
|
| 169 |
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
|
| 170 |
|
| 171 |
-
# Entity regexes
|
| 172 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 173 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 174 |
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
|
| 175 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 176 |
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
|
| 177 |
|
| 178 |
-
# Off-channel patterns (apps / phrases)
|
| 179 |
OFFCHANNEL_PATTERNS = [
|
| 180 |
r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
|
| 181 |
r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
|
|
@@ -184,10 +170,8 @@ OFFCHANNEL_PATTERNS = [
|
|
| 184 |
]
|
| 185 |
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
| 186 |
|
| 187 |
-
# Common personal mail domains (used with user-specified trusted org domains)
|
| 188 |
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 189 |
|
| 190 |
-
# Newsletter/newswire heuristics
|
| 191 |
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
|
| 192 |
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
| 193 |
s = (subject or "").lower()
|
|
@@ -198,13 +182,11 @@ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
|
| 198 |
if any(d in fd for d in NEWS_DOMAINS): return True
|
| 199 |
return False
|
| 200 |
|
| 201 |
-
# -------- System/notification heuristics (bucket as cluster -2) --------
|
| 202 |
NOTIFY_PATTERNS = [
|
| 203 |
r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
|
| 204 |
r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
|
| 205 |
r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
|
| 206 |
r"unable to determine", r"reset your password", r"\balert\b",
|
| 207 |
-
# bounces / gateways / quarantine
|
| 208 |
r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
|
| 209 |
r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
|
| 210 |
r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
|
|
@@ -221,7 +203,6 @@ def is_notification_like(subject: str, body: str, from_email: str, from_domain:
|
|
| 221 |
return True
|
| 222 |
return False
|
| 223 |
|
| 224 |
-
# -------- Fast language heuristic (used when skipping langdetect or on failure) --------
|
| 225 |
HEB_RE = re.compile(r'[\u0590-\u05FF]')
|
| 226 |
AR_RE = re.compile(r'[\u0600-\u06FF]')
|
| 227 |
CYR_RE = re.compile(r'[\u0400-\u04FF]')
|
|
@@ -236,7 +217,6 @@ def fast_lang_heuristic(text: str) -> str:
|
|
| 236 |
return "en"
|
| 237 |
return "unknown"
|
| 238 |
|
| 239 |
-
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 240 |
CORR_LEX = {
|
| 241 |
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
| 242 |
"invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
|
|
@@ -258,16 +238,12 @@ MONTHS = {
|
|
| 258 |
"january","february","march","april","june","july","august","september",
|
| 259 |
"october","november","december"
|
| 260 |
}
|
| 261 |
-
|
| 262 |
-
# Extra junk/HTML/MIME terms to suppress in labels (expanded)
|
| 263 |
STOP_TERMS = {
|
| 264 |
"div","span","nbsp","href","src","img","class","style","align","border","cid",
|
| 265 |
"content","content-type","multipart","alternative","quoted","printable","utf",
|
| 266 |
"windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
|
| 267 |
"type","id","service","person","generated","fyi"
|
| 268 |
}
|
| 269 |
-
|
| 270 |
-
# NEW: broader stop buckets for labels *and* features
|
| 271 |
AUX_STOP = {
|
| 272 |
"will","would","should","could","can","cant","cannot","did","do","does","done",
|
| 273 |
"have","has","had","having","get","got","make","made","let","need","want",
|
|
@@ -288,20 +264,17 @@ TECH_META = {
|
|
| 288 |
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
|
| 289 |
HE_EXTRA_STOP = {"עם","או"}
|
| 290 |
|
| 291 |
-
# fold into STOP_TERMS and build a vectorizer stoplist
|
| 292 |
STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
|
| 293 |
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
|
| 294 |
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
|
| 295 |
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
|
| 296 |
ONE_CHAR_RE = re.compile(r"^.$")
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$") # too many digits
|
| 302 |
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
|
| 303 |
|
| 304 |
-
# This stoplist is used by the CountVectorizer (MUST be list for sklearn)
|
| 305 |
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 306 |
|
| 307 |
def _is_junk_term(t: str) -> bool:
|
|
@@ -366,7 +339,6 @@ def strip_quotes_and_sigs(text: str) -> str:
|
|
| 366 |
cut = idx if (cut is None or idx < cut) else cut
|
| 367 |
if cut is not None:
|
| 368 |
text = text[:cut]
|
| 369 |
-
# extra safety for mobile signatures that sneak through
|
| 370 |
text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
|
| 371 |
text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
|
| 372 |
return text.strip()
|
|
@@ -439,7 +411,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 439 |
if str(raw.get("type", "")).lower() == "meta":
|
| 440 |
return {}
|
| 441 |
|
| 442 |
-
# attachments (names); accept common schemas
|
| 443 |
attach_names = []
|
| 444 |
atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
|
| 445 |
if isinstance(atts, list):
|
|
@@ -490,7 +461,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 490 |
|
| 491 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 492 |
|
| 493 |
-
# Language (use fast heuristic if skipping or detector fails)
|
| 494 |
if use_langdetect:
|
| 495 |
try:
|
| 496 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
|
@@ -562,30 +532,23 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 562 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 563 |
return df
|
| 564 |
|
| 565 |
-
# Visual highlight helpers
|
| 566 |
def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
|
| 567 |
terms = []
|
| 568 |
txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
|
| 569 |
-
# suspect phrases found in this row
|
| 570 |
for p in SUSPECT_PHRASES:
|
| 571 |
if p in txt:
|
| 572 |
terms.append(p)
|
| 573 |
-
# entity markers
|
| 574 |
if MONEY_RE.search(txt): terms.append("$")
|
| 575 |
if INVOICE_RE.search(txt): terms.append("invoice")
|
| 576 |
-
# regex-based (keep as literal samples)
|
| 577 |
if PHONE_RE.search(row.get("body_text","") or ""): terms.append("phone")
|
| 578 |
-
# extras from user input
|
| 579 |
for t in extra_terms or []:
|
| 580 |
t=t.strip()
|
| 581 |
if t and t.lower() in txt:
|
| 582 |
terms.append(t)
|
| 583 |
-
# dedupe
|
| 584 |
out, seen = [], set()
|
| 585 |
for t in terms:
|
| 586 |
if t.lower() not in seen:
|
| 587 |
-
out.append(t)
|
| 588 |
-
seen.add(t.lower())
|
| 589 |
return out[:24]
|
| 590 |
|
| 591 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
|
|
@@ -603,7 +566,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 603 |
hl_terms = []
|
| 604 |
if do_highlight:
|
| 605 |
hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
|
| 606 |
-
# make unique, case-insensitive
|
| 607 |
seen=set(); uniq=[]
|
| 608 |
for t in hl_terms:
|
| 609 |
tl=t.lower()
|
|
@@ -668,11 +630,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 668 |
|
| 669 |
# ---------- Lightweight Embedding Utilities (Optional) ----------
|
| 670 |
def _load_embeddings(emb_path: str, binary: bool):
|
| 671 |
-
"""
|
| 672 |
-
Load word vectors with Gensim if available.
|
| 673 |
-
Accepts word2vec binary (.bin) or text formats (.txt/.vec).
|
| 674 |
-
Returns (model, dim) or (None, 0) if not available.
|
| 675 |
-
"""
|
| 676 |
if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
|
| 677 |
return None, 0
|
| 678 |
try:
|
|
@@ -682,7 +639,6 @@ def _load_embeddings(emb_path: str, binary: bool):
|
|
| 682 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
|
| 683 |
return kv, int(kv.vector_size)
|
| 684 |
except Exception:
|
| 685 |
-
# Attempt GloVe-like with headerless text
|
| 686 |
try:
|
| 687 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
|
| 688 |
return kv, int(kv.vector_size)
|
|
@@ -690,10 +646,6 @@ def _load_embeddings(emb_path: str, binary: bool):
|
|
| 690 |
return None, 0
|
| 691 |
|
| 692 |
def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
| 693 |
-
"""
|
| 694 |
-
Average embeddings over tokens matched by TOKEN_PATTERN.
|
| 695 |
-
Returns zero vector if nothing matches or kv is None.
|
| 696 |
-
"""
|
| 697 |
vec = np.zeros((dim,), dtype=np.float32)
|
| 698 |
if not kv or not text:
|
| 699 |
return vec
|
|
@@ -705,16 +657,12 @@ def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
|
| 705 |
cnt += 1
|
| 706 |
if cnt > 0:
|
| 707 |
vec /= float(cnt)
|
| 708 |
-
# L2-normalize
|
| 709 |
n = np.linalg.norm(vec)
|
| 710 |
if n > 0:
|
| 711 |
vec /= n
|
| 712 |
return vec
|
| 713 |
|
| 714 |
def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
|
| 715 |
-
"""
|
| 716 |
-
Build [n_docs, dim] dense matrix of averaged embeddings.
|
| 717 |
-
"""
|
| 718 |
if not kv or dim <= 0:
|
| 719 |
return np.zeros((len(texts), 0), dtype=np.float32)
|
| 720 |
out = np.zeros((len(texts), dim), dtype=np.float32)
|
|
@@ -773,15 +721,8 @@ def cluster_labels_pmi_bigram(
|
|
| 773 |
topn=6,
|
| 774 |
subject_alpha=0.75,
|
| 775 |
global_ubiq_cut=0.20,
|
| 776 |
-
subject_min_cov=0.30
|
| 777 |
):
|
| 778 |
-
"""
|
| 779 |
-
Improved labeler:
|
| 780 |
-
- Considers bigrams AND trigrams (PMI vs. global)
|
| 781 |
-
- Class-TFIDF unigrams with subject coverage boost
|
| 782 |
-
- Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
|
| 783 |
-
- NEW: prefers subject terms that occur in ≥30% of cluster subjects (can be tuned via subject_min_cov)
|
| 784 |
-
"""
|
| 785 |
import math as _math
|
| 786 |
from collections import Counter, defaultdict
|
| 787 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
@@ -792,12 +733,11 @@ def cluster_labels_pmi_bigram(
|
|
| 792 |
def is_junk_token(tok: str) -> bool:
|
| 793 |
if _is_junk_term(tok): return True
|
| 794 |
tl = tok.lower()
|
| 795 |
-
if tl.startswith("__"): return True
|
| 796 |
if "@" in tl: return True
|
| 797 |
if tl.isascii() and len(tl) <= 2: return True
|
| 798 |
if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
|
| 799 |
if len(tok) > 40: return True
|
| 800 |
-
# strip punctuation-heavy artifacts (URLs already replaced with 'URL')
|
| 801 |
if re.search(r"[^\w\-’']", tl): return True
|
| 802 |
return False
|
| 803 |
|
|
@@ -808,7 +748,6 @@ def cluster_labels_pmi_bigram(
|
|
| 808 |
def ngrams(toks, n):
|
| 809 |
return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
|
| 810 |
|
| 811 |
-
# Compute global doc frequency for tokens, bigrams, trigrams
|
| 812 |
glob_df_uni = Counter()
|
| 813 |
glob_df_bg = Counter()
|
| 814 |
glob_df_tri = Counter()
|
|
@@ -822,23 +761,19 @@ def cluster_labels_pmi_bigram(
|
|
| 822 |
|
| 823 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 824 |
|
| 825 |
-
# Pre-pass: DF stats
|
| 826 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 827 |
c = int(c)
|
| 828 |
toks = tokenize_clean(txt)
|
| 829 |
uni_set = set(toks)
|
| 830 |
bg_set = set(ngrams(toks, 2))
|
| 831 |
tri_set = set(ngrams(toks, 3))
|
| 832 |
-
# DF
|
| 833 |
glob_df_uni.update(uni_set)
|
| 834 |
glob_df_bg.update(bg_set)
|
| 835 |
glob_df_tri.update(tri_set)
|
| 836 |
-
# Per-cluster counts
|
| 837 |
per_c_bg[c].update(bg_set)
|
| 838 |
per_c_tri[c].update(tri_set)
|
| 839 |
per_c_texts[c].append(" ".join(toks))
|
| 840 |
per_c_doc_count[c] += 1
|
| 841 |
-
# Subject presence
|
| 842 |
if have_subjects:
|
| 843 |
stoks = tokenize_clean(subjects[idx] or "")
|
| 844 |
s_uni = set(stoks)
|
|
@@ -849,16 +784,13 @@ def cluster_labels_pmi_bigram(
|
|
| 849 |
per_c_subj_tri_docs[c].update(s_tri)
|
| 850 |
|
| 851 |
N = max(1, len(texts))
|
| 852 |
-
labels_out = {}
|
| 853 |
|
| 854 |
-
|
| 855 |
-
def too_ubiquitous(df_count): # fraction of docs
|
| 856 |
return (df_count / float(N)) > float(global_ubiq_cut)
|
| 857 |
|
|
|
|
| 858 |
for c in sorted(set(int(x) for x in labels)):
|
| 859 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 860 |
-
|
| 861 |
-
# ===== PMI bigrams & trigrams with subject-coverage boost & ≥30% preference =====
|
| 862 |
phrases = []
|
| 863 |
for store, glob_df, subj_docs, n in (
|
| 864 |
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
|
@@ -868,7 +800,7 @@ def cluster_labels_pmi_bigram(
|
|
| 868 |
total_g = sum(glob_df.values()) + 1e-12
|
| 869 |
scored = []
|
| 870 |
for ng, cnt in store.most_common(3000):
|
| 871 |
-
if too_ubiquitous(glob_df[ng]):
|
| 872 |
continue
|
| 873 |
p_ng_c = cnt / total_c
|
| 874 |
p_ng_g = (glob_df[ng] / total_g)
|
|
@@ -877,17 +809,14 @@ def cluster_labels_pmi_bigram(
|
|
| 877 |
cov = 0.0
|
| 878 |
if have_subjects:
|
| 879 |
cov = subj_docs[ng] / n_docs_c
|
| 880 |
-
# add a preference bump if subject coverage ≥ threshold
|
| 881 |
if cov >= subject_min_cov:
|
| 882 |
-
score += 0.6
|
| 883 |
score += subject_alpha * cov
|
| 884 |
scored.append((score, cov, ng))
|
| 885 |
-
# prefer subject-coverage ≥ threshold first, then highest score
|
| 886 |
scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
|
| 887 |
take = max(1, topn // (3 if n == 3 else 2))
|
| 888 |
phrases.extend([p for _, _, p in scored[:take]])
|
| 889 |
|
| 890 |
-
# ===== Class-TFIDF unigrams with subject coverage boost & ≥30% preference =====
|
| 891 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 892 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 893 |
corpus = [docs_c[0], docs_bg[0]]
|
|
@@ -904,15 +833,13 @@ def cluster_labels_pmi_bigram(
|
|
| 904 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 905 |
if have_subjects:
|
| 906 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 907 |
-
if tok in vocab_index and not
|
| 908 |
i = vocab_index[tok]
|
| 909 |
frac = cnt_docs / n_docs_c
|
| 910 |
subj_cov[i] = frac
|
| 911 |
subj_cov_frac[i] = frac
|
| 912 |
|
| 913 |
-
# base + subject alpha
|
| 914 |
row_boosted = row + subject_alpha * subj_cov
|
| 915 |
-
# final score gets a preference bump if subj coverage ≥ threshold (but not a hard filter)
|
| 916 |
pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
|
| 917 |
final = row_boosted + pref_bump
|
| 918 |
|
|
@@ -920,8 +847,8 @@ def cluster_labels_pmi_bigram(
|
|
| 920 |
unis = []
|
| 921 |
for i in order:
|
| 922 |
tok = vocab[i]
|
| 923 |
-
if
|
| 924 |
-
if too_ubiquitous(glob_df_uni.get(tok, 0)):
|
| 925 |
continue
|
| 926 |
unis.append(tok)
|
| 927 |
if len(unis) >= max(0, topn - len(phrases)):
|
|
@@ -935,11 +862,9 @@ def cluster_labels_pmi_bigram(
|
|
| 935 |
# =================== Auto-k & merge ===================
|
| 936 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 937 |
n = X.shape[0]
|
| 938 |
-
# NEW: tiny-partition guard
|
| 939 |
if n <= 1:
|
| 940 |
return 1, {1: 0.0}
|
| 941 |
if n < min(ks):
|
| 942 |
-
# pick a small, reasonable k for tiny n
|
| 943 |
k_small = max(2, min(10, n))
|
| 944 |
return int(k_small), {int(k_small): 0.0}
|
| 945 |
|
|
@@ -952,7 +877,7 @@ def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
|
| 952 |
inertias = []
|
| 953 |
for k in ks:
|
| 954 |
k = int(k)
|
| 955 |
-
if n < k:
|
| 956 |
break
|
| 957 |
km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
|
| 958 |
km.fit(Xs)
|
|
@@ -981,7 +906,7 @@ def merge_close_clusters(labels, centers, thresh=0.92):
|
|
| 981 |
while parent[a]!=a: a=parent[a]
|
| 982 |
return a
|
| 983 |
for i in range(k):
|
| 984 |
-
for j in range(i+1, k):
|
| 985 |
if sim[i,j] >= thresh:
|
| 986 |
pi, pj = find(i), find(j)
|
| 987 |
if pi!=pj: parent[pj]=pi
|
|
@@ -1024,7 +949,7 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
|
|
| 1024 |
return seeds_red
|
| 1025 |
return None
|
| 1026 |
|
| 1027 |
-
# =================== NEW: cluster stabilizer
|
| 1028 |
def _centroids_from_labels(X, labels):
|
| 1029 |
labs = np.asarray(labels, dtype=int)
|
| 1030 |
uniq = np.unique(labs)
|
|
@@ -1038,7 +963,6 @@ def _centroids_from_labels(X, labels):
|
|
| 1038 |
if n > 0: v = v / n
|
| 1039 |
cents[int(c)] = v.astype(np.float32)
|
| 1040 |
return cents
|
| 1041 |
-
# CSR sparse
|
| 1042 |
X = X.tocsr()
|
| 1043 |
for c in uniq:
|
| 1044 |
rows = np.where(labs == c)[0]
|
|
@@ -1054,7 +978,7 @@ def _cosine_sim_to_centroids(vecs, centroids):
|
|
| 1054 |
if not centroids:
|
| 1055 |
return None, None
|
| 1056 |
keys = list(centroids.keys())
|
| 1057 |
-
C = np.stack([centroids[k] for k in keys], axis=0)
|
| 1058 |
if isinstance(vecs, np.ndarray):
|
| 1059 |
sims = vecs @ C.T
|
| 1060 |
else:
|
|
@@ -1067,8 +991,6 @@ def _cosine_sim_to_centroids(vecs, centroids):
|
|
| 1067 |
|
| 1068 |
def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
|
| 1069 |
labs = np.asarray(labels, dtype=int)
|
| 1070 |
-
|
| 1071 |
-
# 1) merge very close centroids
|
| 1072 |
cents = _centroids_from_labels(X_space, labs)
|
| 1073 |
keys = sorted([k for k in cents.keys() if k >= 0])
|
| 1074 |
if len(keys) >= 2:
|
|
@@ -1090,7 +1012,6 @@ def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_t
|
|
| 1090 |
labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
|
| 1091 |
cents = _centroids_from_labels(X_space, labs)
|
| 1092 |
|
| 1093 |
-
# 2) reassign tiny clusters to nearest big centroid (else noise -3)
|
| 1094 |
vc = pd.Series(labs).value_counts()
|
| 1095 |
big_labs = set(vc[vc >= int(min_size)].index.tolist())
|
| 1096 |
small_labs = set(vc[vc < int(min_size)].index.tolist())
|
|
@@ -1155,31 +1076,27 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
|
|
| 1155 |
df_in["context_anomaly_score"] = 0.0
|
| 1156 |
return df_in
|
| 1157 |
|
| 1158 |
-
# 1) IsolationForest percentile -> 0–6 (you already computed anomaly_score per partition; lower is “more anomalous” if using score_samples with sign reversed above)
|
| 1159 |
df = df_in.copy()
|
| 1160 |
if "anomaly_score" in df.columns:
|
| 1161 |
-
# higher = more anomalous in your current pipeline (you negated score_samples). Convert to percentile per bucket.
|
| 1162 |
df["_if_pct"] = 0.0
|
| 1163 |
for bkt, grp in df.groupby("bucket", dropna=False):
|
| 1164 |
vals = grp["anomaly_score"].astype(float)
|
| 1165 |
if vals.notna().sum() >= 5:
|
| 1166 |
-
ranks = vals.rank(pct=True, ascending=False)
|
| 1167 |
df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
|
| 1168 |
df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
|
| 1169 |
else:
|
| 1170 |
df["_if_pts"] = 0.0
|
| 1171 |
|
| 1172 |
-
# 2) Rule violations per bucket (0–2)
|
| 1173 |
df["_rule_pts"] = 0.0
|
| 1174 |
low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
|
| 1175 |
for bkt, terms in TAXONOMY.items():
|
| 1176 |
mask = (df["bucket"] == bkt)
|
| 1177 |
-
if not mask.any():
|
| 1178 |
continue
|
| 1179 |
if terms:
|
| 1180 |
has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
|
| 1181 |
df.loc[mask & (~has_term), "_rule_pts"] += 1.0
|
| 1182 |
-
# header expectation examples:
|
| 1183 |
if bkt == "Constituent":
|
| 1184 |
df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
|
| 1185 |
if bkt == "Scheduling":
|
|
@@ -1187,8 +1104,6 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
|
|
| 1187 |
df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0
|
| 1188 |
|
| 1189 |
df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
|
| 1190 |
-
|
| 1191 |
-
# 3) Corruption heuristics capped to 0–3
|
| 1192 |
df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
|
| 1193 |
|
| 1194 |
df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
|
|
@@ -1522,6 +1437,8 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 1522 |
|
| 1523 |
with gr.Row():
|
| 1524 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
|
|
|
| 1525 |
reset_btn = gr.Button("Reset filters")
|
| 1526 |
status = gr.Markdown("")
|
| 1527 |
|
|
@@ -1561,8 +1478,9 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 1561 |
state_dims = gr.State()
|
| 1562 |
state_extra_terms = gr.State()
|
| 1563 |
state_highlight = gr.State()
|
|
|
|
| 1564 |
|
| 1565 |
-
|
| 1566 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
| 1567 |
recs: List[Dict[str, Any]] = []
|
| 1568 |
if local_path.endswith(".jsonl"):
|
|
@@ -1945,7 +1863,13 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 1945 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1946 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 1947 |
|
| 1948 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1949 |
if not recs:
|
| 1950 |
return ("**No valid records found.**",
|
| 1951 |
None, None, None, None, None,
|
|
@@ -2234,6 +2158,38 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 2234 |
state_extra_terms, state_highlight,
|
| 2235 |
bucket_drop,
|
| 2236 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2237 |
)
|
| 2238 |
|
| 2239 |
# -------- Filtering & Search --------
|
|
@@ -2403,4 +2359,3 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 2403 |
if __name__ == "__main__":
|
| 2404 |
# Disable SSR to avoid handler arity warnings under server-side rendering
|
| 2405 |
demo.launch(ssr_mode=False)
|
| 2406 |
-
|
|
|
|
| 72 |
"HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
|
| 73 |
"Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
|
| 74 |
"Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
|
| 75 |
+
"Legal": ["legal","lawsuit","intake","attorney","counsel","privileged","court","subpoena","confidential"],
|
| 76 |
"IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
|
| 77 |
"Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
|
| 78 |
"Other": [],
|
|
|
|
| 94 |
if bucket == "IT/Security":
|
| 95 |
return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
|
| 96 |
if bucket == "Constituent":
|
|
|
|
| 97 |
return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
|
| 98 |
if bucket == "Lobbyist":
|
| 99 |
return 5.0 if fd in LOBBY_DOMAINS else 0.0
|
| 100 |
if bucket == "Legal":
|
| 101 |
return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
|
| 102 |
if bucket == "Scheduling":
|
|
|
|
| 103 |
body = (row.get("body_text") or "")
|
| 104 |
return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
|
| 105 |
return 0.0
|
| 106 |
|
| 107 |
+
MIN_ROUTE_SCORE = 1.5
|
| 108 |
TIE_MARGIN = 1.0
|
| 109 |
|
| 110 |
def route_email_row(row: pd.Series) -> str:
|
| 111 |
text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 112 |
scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
|
|
|
|
| 113 |
for b, terms in TAXONOMY.items():
|
| 114 |
+
if not terms:
|
| 115 |
continue
|
|
|
|
| 116 |
hits = sum(1 for t in terms if t and t.lower() in text)
|
| 117 |
scores[b] += float(hits)
|
|
|
|
| 118 |
if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
|
| 119 |
scores[b] += 1.0
|
|
|
|
| 120 |
for b in TAXONOMY.keys():
|
| 121 |
scores[b] += _bucket_header_bonus(row, b)
|
|
|
|
| 122 |
best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
|
| 123 |
second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
|
| 124 |
if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
|
|
|
|
| 140 |
|
| 141 |
# ==== Expanded corruption lexicon ====
|
| 142 |
SUSPECT_PHRASES = [
|
|
|
|
| 143 |
"off the books","cover up","kickback","bribe","under the table",
|
| 144 |
"no inspection","special fee","friendly payment","confidential deal",
|
| 145 |
"nobody will find out","pay to play","cash only","shell company",
|
| 146 |
"bid rigging","embezzle","slush fund","false invoice","ghost employee",
|
| 147 |
"contract splitting","grease payment","unreported","unrecorded",
|
|
|
|
| 148 |
"off the record","just between us","don’t quote me on this","dont quote me on this",
|
| 149 |
"we never had this conversation","keep this between us","not ethical","illegal",
|
| 150 |
"grey area","gray area","write off","failed investment","they owe it to me",
|
|
|
|
| 151 |
"let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
|
| 152 |
"don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
|
| 153 |
"tell you on the phone","talk in person","come by my office","vpn",
|
|
|
|
| 154 |
"tax haven","off-shore account","offshore account","backdate","pull earnings forward",
|
| 155 |
"delete this email","no inspection","special fees","wire instructions",
|
| 156 |
]
|
|
|
|
| 157 |
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
|
| 158 |
|
|
|
|
| 159 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 160 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 161 |
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
|
| 162 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 163 |
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
|
| 164 |
|
|
|
|
| 165 |
OFFCHANNEL_PATTERNS = [
|
| 166 |
r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
|
| 167 |
r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
|
|
|
|
| 170 |
]
|
| 171 |
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
| 172 |
|
|
|
|
| 173 |
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 174 |
|
|
|
|
| 175 |
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
|
| 176 |
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
| 177 |
s = (subject or "").lower()
|
|
|
|
| 182 |
if any(d in fd for d in NEWS_DOMAINS): return True
|
| 183 |
return False
|
| 184 |
|
|
|
|
| 185 |
NOTIFY_PATTERNS = [
|
| 186 |
r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
|
| 187 |
r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
|
| 188 |
r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
|
| 189 |
r"unable to determine", r"reset your password", r"\balert\b",
|
|
|
|
| 190 |
r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
|
| 191 |
r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
|
| 192 |
r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
|
|
|
|
| 203 |
return True
|
| 204 |
return False
|
| 205 |
|
|
|
|
| 206 |
# Unicode-script character classes (one match anywhere in a string is
# enough to flag the script's presence).
HEB_RE = re.compile(r'[\u0590-\u05FF]')  # Hebrew block
AR_RE = re.compile(r'[\u0600-\u06FF]')   # Arabic block
CYR_RE = re.compile(r'[\u0400-\u04FF]')  # Cyrillic block
|
|
|
|
| 217 |
return "en"
|
| 218 |
return "unknown"
|
| 219 |
|
|
|
|
| 220 |
CORR_LEX = {
|
| 221 |
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
| 222 |
"invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
|
|
|
|
| 238 |
"january","february","march","april","june","july","august","september",
|
| 239 |
"october","november","december"
|
| 240 |
}
|
|
|
|
|
|
|
| 241 |
# HTML/MIME boilerplate tokens that leak out of raw email markup; merged
# into the vectorizer stoplist (see STOPWORD_FOR_VEC) so they never become
# cluster keywords.
STOP_TERMS = {
    "div","span","nbsp","href","src","img","class","style","align","border","cid",
    "content","content-type","multipart","alternative","quoted","printable","utf",
    "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
    "type","id","service","person","generated","fyi",
}
|
|
|
|
|
|
|
| 247 |
AUX_STOP = {
|
| 248 |
"will","would","should","could","can","cant","cannot","did","do","does","done",
|
| 249 |
"have","has","had","having","get","got","make","made","let","need","want",
|
|
|
|
| 264 |
# Chinese forwarded-mail header tokens (发件人 = "From", 收件人 = "To",
# 主题 = "Subject", 发送时间 = "Sent", 转发 = "Forward") plus weekday markers,
# so quoted headers don't pollute extracted keywords.
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
# Extra Hebrew function words ("with", "or") not covered elsewhere.
HE_EXTRA_STOP = {"עם","או"}
|
| 266 |
|
|
|
|
| 267 |
# Fold all auxiliary stoplists into the master STOP_TERMS set.
# NOTE(review): CTA_STOP and TECH_META are defined elsewhere in this file.
STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
|
| 268 |
# Token-level junk filters applied when cleaning candidate vocabulary terms.
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)  # addresses / bare domains
YEAR_RE = re.compile(r"^(19|20)\d{2}$")             # four-digit years 1900-2099
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")      # pure numbers, dates, ratios
ONE_CHAR_RE = re.compile(r"^.$")                    # single characters
LONG_ALNUM_RE = re.compile(r"^[A-Za-z0-9_-]{24,}$")  # opaque ids/tokens (24+ chars)
HEXISH_RE = re.compile(r"^(?:[A-Fa-f0-9]{8,})$")     # hex-looking strings (8+ chars)
DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$")   # strings containing 6+ digits
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")  # snake_case-style identifiers
|
| 277 |
|
|
|
|
| 278 |
# Combined stopword list, sorted for deterministic ordering; presumably
# passed as the vectorizer's stop_words — confirm at the call site.
# NOTE(review): EN_STOP and HE_STOP are defined elsewhere in this file.
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 279 |
|
| 280 |
def _is_junk_term(t: str) -> bool:
|
|
|
|
| 339 |
cut = idx if (cut is None or idx < cut) else cut
|
| 340 |
if cut is not None:
|
| 341 |
text = text[:cut]
|
|
|
|
| 342 |
text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
|
| 343 |
text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
|
| 344 |
return text.strip()
|
|
|
|
| 411 |
if str(raw.get("type", "")).lower() == "meta":
|
| 412 |
return {}
|
| 413 |
|
|
|
|
| 414 |
attach_names = []
|
| 415 |
atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
|
| 416 |
if isinstance(atts, list):
|
|
|
|
| 461 |
|
| 462 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 463 |
|
|
|
|
| 464 |
if use_langdetect:
|
| 465 |
try:
|
| 466 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
|
|
|
| 532 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 533 |
return df
|
| 534 |
|
|
|
|
| 535 |
def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
    """Collect terms worth highlighting for one email row.

    Scans the subject + body for known suspect phrases, money/invoice/phone
    signals, and any caller-supplied extra terms, then de-duplicates
    case-insensitively (keeping first-seen casing) and caps the result.

    Parameters
    ----------
    row : pd.Series
        Email record; the "subject" and "body_text" fields are read.
    extra_terms : List[str]
        Additional user keywords; kept only if they actually occur in the text.

    Returns
    -------
    List[str]
        At most 24 unique highlight terms.
    """
    # Guard against missing/None fields (same `... or ""` pattern used
    # elsewhere in this file).
    subject = row.get("subject") or ""
    body = row.get("body_text") or ""
    txt = (subject + " " + body).lower()

    terms = [p for p in SUSPECT_PHRASES if p in txt]
    if MONEY_RE.search(txt):
        terms.append("$")
    if INVOICE_RE.search(txt):
        terms.append("invoice")
    # Phone numbers are searched in the body only (matches original behavior).
    if PHONE_RE.search(body):
        terms.append("phone")
    for t in extra_terms or []:
        t = t.strip()
        if t and t.lower() in txt:
            terms.append(t)

    # De-duplicate case-insensitively, preserving first-seen order and casing.
    out: List[str] = []
    seen = set()
    for t in terms:
        tl = t.lower()
        if tl not in seen:
            out.append(t)
            seen.add(tl)
    return out[:24]
|
| 553 |
|
| 554 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
|
|
|
|
| 566 |
hl_terms = []
|
| 567 |
if do_highlight:
|
| 568 |
hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
|
|
|
|
| 569 |
seen=set(); uniq=[]
|
| 570 |
for t in hl_terms:
|
| 571 |
tl=t.lower()
|
|
|
|
| 630 |
|
| 631 |
# ---------- Lightweight Embedding Utilities (Optional) ----------
|
| 632 |
def _load_embeddings(emb_path: str, binary: bool):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
|
| 634 |
return None, 0
|
| 635 |
try:
|
|
|
|
| 639 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
|
| 640 |
return kv, int(kv.vector_size)
|
| 641 |
except Exception:
|
|
|
|
| 642 |
try:
|
| 643 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
|
| 644 |
return kv, int(kv.vector_size)
|
|
|
|
| 646 |
return None, 0
|
| 647 |
|
| 648 |
def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
vec = np.zeros((dim,), dtype=np.float32)
|
| 650 |
if not kv or not text:
|
| 651 |
return vec
|
|
|
|
| 657 |
cnt += 1
|
| 658 |
if cnt > 0:
|
| 659 |
vec /= float(cnt)
|
|
|
|
| 660 |
n = np.linalg.norm(vec)
|
| 661 |
if n > 0:
|
| 662 |
vec /= n
|
| 663 |
return vec
|
| 664 |
|
| 665 |
def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
| 666 |
if not kv or dim <= 0:
|
| 667 |
return np.zeros((len(texts), 0), dtype=np.float32)
|
| 668 |
out = np.zeros((len(texts), dim), dtype=np.float32)
|
|
|
|
| 721 |
topn=6,
|
| 722 |
subject_alpha=0.75,
|
| 723 |
global_ubiq_cut=0.20,
|
| 724 |
+
subject_min_cov=0.30
|
| 725 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
import math as _math
|
| 727 |
from collections import Counter, defaultdict
|
| 728 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
| 733 |
def is_junk_token(tok: str) -> bool:
|
| 734 |
if _is_junk_term(tok): return True
|
| 735 |
tl = tok.lower()
|
| 736 |
+
if tl.startswith("__"): return True
|
| 737 |
if "@" in tl: return True
|
| 738 |
if tl.isascii() and len(tl) <= 2: return True
|
| 739 |
if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
|
| 740 |
if len(tok) > 40: return True
|
|
|
|
| 741 |
if re.search(r"[^\w\-’']", tl): return True
|
| 742 |
return False
|
| 743 |
|
|
|
|
| 748 |
def ngrams(toks, n):
|
| 749 |
return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
|
| 750 |
|
|
|
|
| 751 |
glob_df_uni = Counter()
|
| 752 |
glob_df_bg = Counter()
|
| 753 |
glob_df_tri = Counter()
|
|
|
|
| 761 |
|
| 762 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 763 |
|
|
|
|
| 764 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 765 |
c = int(c)
|
| 766 |
toks = tokenize_clean(txt)
|
| 767 |
uni_set = set(toks)
|
| 768 |
bg_set = set(ngrams(toks, 2))
|
| 769 |
tri_set = set(ngrams(toks, 3))
|
|
|
|
| 770 |
glob_df_uni.update(uni_set)
|
| 771 |
glob_df_bg.update(bg_set)
|
| 772 |
glob_df_tri.update(tri_set)
|
|
|
|
| 773 |
per_c_bg[c].update(bg_set)
|
| 774 |
per_c_tri[c].update(tri_set)
|
| 775 |
per_c_texts[c].append(" ".join(toks))
|
| 776 |
per_c_doc_count[c] += 1
|
|
|
|
| 777 |
if have_subjects:
|
| 778 |
stoks = tokenize_clean(subjects[idx] or "")
|
| 779 |
s_uni = set(stoks)
|
|
|
|
| 784 |
per_c_subj_tri_docs[c].update(s_tri)
|
| 785 |
|
| 786 |
N = max(1, len(texts))
|
|
|
|
| 787 |
|
| 788 |
+
def too_ubiquitous(df_count):
|
|
|
|
| 789 |
return (df_count / float(N)) > float(global_ubiq_cut)
|
| 790 |
|
| 791 |
+
labels_out = {}
|
| 792 |
for c in sorted(set(int(x) for x in labels)):
|
| 793 |
n_docs_c = max(1, per_c_doc_count[c])
|
|
|
|
|
|
|
| 794 |
phrases = []
|
| 795 |
for store, glob_df, subj_docs, n in (
|
| 796 |
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
|
|
|
| 800 |
total_g = sum(glob_df.values()) + 1e-12
|
| 801 |
scored = []
|
| 802 |
for ng, cnt in store.most_common(3000):
|
| 803 |
+
if too_ubiquitous(glob_df[ng]):
|
| 804 |
continue
|
| 805 |
p_ng_c = cnt / total_c
|
| 806 |
p_ng_g = (glob_df[ng] / total_g)
|
|
|
|
| 809 |
cov = 0.0
|
| 810 |
if have_subjects:
|
| 811 |
cov = subj_docs[ng] / n_docs_c
|
|
|
|
| 812 |
if cov >= subject_min_cov:
|
| 813 |
+
score += 0.6
|
| 814 |
score += subject_alpha * cov
|
| 815 |
scored.append((score, cov, ng))
|
|
|
|
| 816 |
scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
|
| 817 |
take = max(1, topn // (3 if n == 3 else 2))
|
| 818 |
phrases.extend([p for _, _, p in scored[:take]])
|
| 819 |
|
|
|
|
| 820 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 821 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 822 |
corpus = [docs_c[0], docs_bg[0]]
|
|
|
|
| 833 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 834 |
if have_subjects:
|
| 835 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 836 |
+
if tok in vocab_index and not _is_junk_term(tok):
|
| 837 |
i = vocab_index[tok]
|
| 838 |
frac = cnt_docs / n_docs_c
|
| 839 |
subj_cov[i] = frac
|
| 840 |
subj_cov_frac[i] = frac
|
| 841 |
|
|
|
|
| 842 |
row_boosted = row + subject_alpha * subj_cov
|
|
|
|
| 843 |
pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
|
| 844 |
final = row_boosted + pref_bump
|
| 845 |
|
|
|
|
| 847 |
unis = []
|
| 848 |
for i in order:
|
| 849 |
tok = vocab[i]
|
| 850 |
+
if _is_junk_term(tok): continue
|
| 851 |
+
if too_ubiquitous(glob_df_uni.get(tok, 0)):
|
| 852 |
continue
|
| 853 |
unis.append(tok)
|
| 854 |
if len(unis) >= max(0, topn - len(phrases)):
|
|
|
|
| 862 |
# =================== Auto-k & merge ===================
|
| 863 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 864 |
n = X.shape[0]
|
|
|
|
| 865 |
if n <= 1:
|
| 866 |
return 1, {1: 0.0}
|
| 867 |
if n < min(ks):
|
|
|
|
| 868 |
k_small = max(2, min(10, n))
|
| 869 |
return int(k_small), {int(k_small): 0.0}
|
| 870 |
|
|
|
|
| 877 |
inertias = []
|
| 878 |
for k in ks:
|
| 879 |
k = int(k)
|
| 880 |
+
if n < k:
|
| 881 |
break
|
| 882 |
km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
|
| 883 |
km.fit(Xs)
|
|
|
|
| 906 |
while parent[a]!=a: a=parent[a]
|
| 907 |
return a
|
| 908 |
for i in range(k):
|
| 909 |
+
for j in range(i+1, k):
|
| 910 |
if sim[i,j] >= thresh:
|
| 911 |
pi, pj = find(i), find(j)
|
| 912 |
if pi!=pj: parent[pj]=pi
|
|
|
|
| 949 |
return seeds_red
|
| 950 |
return None
|
| 951 |
|
| 952 |
+
# =================== NEW: cluster stabilizer ===================
|
| 953 |
def _centroids_from_labels(X, labels):
|
| 954 |
labs = np.asarray(labels, dtype=int)
|
| 955 |
uniq = np.unique(labs)
|
|
|
|
| 963 |
if n > 0: v = v / n
|
| 964 |
cents[int(c)] = v.astype(np.float32)
|
| 965 |
return cents
|
|
|
|
| 966 |
X = X.tocsr()
|
| 967 |
for c in uniq:
|
| 968 |
rows = np.where(labs == c)[0]
|
|
|
|
| 978 |
if not centroids:
|
| 979 |
return None, None
|
| 980 |
keys = list(centroids.keys())
|
| 981 |
+
C = np.stack([centroids[k] for k in keys], axis=0)
|
| 982 |
if isinstance(vecs, np.ndarray):
|
| 983 |
sims = vecs @ C.T
|
| 984 |
else:
|
|
|
|
| 991 |
|
| 992 |
def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
|
| 993 |
labs = np.asarray(labels, dtype=int)
|
|
|
|
|
|
|
| 994 |
cents = _centroids_from_labels(X_space, labs)
|
| 995 |
keys = sorted([k for k in cents.keys() if k >= 0])
|
| 996 |
if len(keys) >= 2:
|
|
|
|
| 1012 |
labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
|
| 1013 |
cents = _centroids_from_labels(X_space, labs)
|
| 1014 |
|
|
|
|
| 1015 |
vc = pd.Series(labs).value_counts()
|
| 1016 |
big_labs = set(vc[vc >= int(min_size)].index.tolist())
|
| 1017 |
small_labs = set(vc[vc < int(min_size)].index.tolist())
|
|
|
|
| 1076 |
df_in["context_anomaly_score"] = 0.0
|
| 1077 |
return df_in
|
| 1078 |
|
|
|
|
| 1079 |
df = df_in.copy()
|
| 1080 |
if "anomaly_score" in df.columns:
|
|
|
|
| 1081 |
df["_if_pct"] = 0.0
|
| 1082 |
for bkt, grp in df.groupby("bucket", dropna=False):
|
| 1083 |
vals = grp["anomaly_score"].astype(float)
|
| 1084 |
if vals.notna().sum() >= 5:
|
| 1085 |
+
ranks = vals.rank(pct=True, ascending=False)
|
| 1086 |
df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
|
| 1087 |
df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
|
| 1088 |
else:
|
| 1089 |
df["_if_pts"] = 0.0
|
| 1090 |
|
|
|
|
| 1091 |
df["_rule_pts"] = 0.0
|
| 1092 |
low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
|
| 1093 |
for bkt, terms in TAXONOMY.items():
|
| 1094 |
mask = (df["bucket"] == bkt)
|
| 1095 |
+
if not mask.any():
|
| 1096 |
continue
|
| 1097 |
if terms:
|
| 1098 |
has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
|
| 1099 |
df.loc[mask & (~has_term), "_rule_pts"] += 1.0
|
|
|
|
| 1100 |
if bkt == "Constituent":
|
| 1101 |
df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
|
| 1102 |
if bkt == "Scheduling":
|
|
|
|
| 1104 |
df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0
|
| 1105 |
|
| 1106 |
df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
|
|
|
|
|
|
|
| 1107 |
df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
|
| 1108 |
|
| 1109 |
df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
|
|
|
|
| 1437 |
|
| 1438 |
with gr.Row():
|
| 1439 |
run_btn = gr.Button("Process", variant="primary")
|
| 1440 |
+
# NEW: Update button lets you re-run with same uploaded file & current settings
|
| 1441 |
+
update_btn = gr.Button("Update", variant="secondary") # NEW: Update
|
| 1442 |
reset_btn = gr.Button("Reset filters")
|
| 1443 |
status = gr.Markdown("")
|
| 1444 |
|
|
|
|
| 1478 |
state_dims = gr.State()
|
| 1479 |
state_extra_terms = gr.State()
|
| 1480 |
state_highlight = gr.State()
|
| 1481 |
+
state_inbox = gr.State() # NEW: keep last uploaded file for Update
|
| 1482 |
|
| 1483 |
+
# -------- IO helpers --------
|
| 1484 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
| 1485 |
recs: List[Dict[str, Any]] = []
|
| 1486 |
if local_path.endswith(".jsonl"):
|
|
|
|
| 1863 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1864 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 1865 |
|
| 1866 |
+
# Handle Gradio file object
|
| 1867 |
+
try:
|
| 1868 |
+
infile_path = inbox_file.name
|
| 1869 |
+
except Exception:
|
| 1870 |
+
infile_path = str(inbox_file) if inbox_file else ""
|
| 1871 |
+
|
| 1872 |
+
recs = _load_json_records(infile_path)
|
| 1873 |
if not recs:
|
| 1874 |
return ("**No valid records found.**",
|
| 1875 |
None, None, None, None, None,
|
|
|
|
| 2158 |
state_extra_terms, state_highlight,
|
| 2159 |
bucket_drop,
|
| 2160 |
],
|
| 2161 |
+
).then(
|
| 2162 |
+
# remember the uploaded file for future "Update" runs
|
| 2163 |
+
lambda f: f, inputs=[inbox_file], outputs=[state_inbox]
|
| 2164 |
+
)
|
| 2165 |
+
|
| 2166 |
+
# Keep state_inbox in sync whenever a new file is uploaded
|
| 2167 |
+
inbox_file.change(lambda f: f, inputs=[inbox_file], outputs=[state_inbox])
|
| 2168 |
+
|
| 2169 |
+
# NEW: Bind Update button — re-run with the last uploaded file + current settings
|
| 2170 |
+
update_btn.click(
|
| 2171 |
+
process_file,
|
| 2172 |
+
inputs=[
|
| 2173 |
+
state_inbox, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 2174 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 2175 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 2176 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 2177 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
|
| 2178 |
+
watchlist_in, min_mentions
|
| 2179 |
+
],
|
| 2180 |
+
outputs=[
|
| 2181 |
+
status, cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 2182 |
+
actors_df, offhours_df,
|
| 2183 |
+
surv_entities_df, surv_samples_df,
|
| 2184 |
+
results_df,
|
| 2185 |
+
state_df, state_vec, state_X_reduced,
|
| 2186 |
+
state_index, state_term_names,
|
| 2187 |
+
state_use_lsa, state_use_faiss,
|
| 2188 |
+
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 2189 |
+
state_svd, state_norm, state_dims,
|
| 2190 |
+
state_extra_terms, state_highlight,
|
| 2191 |
+
bucket_drop,
|
| 2192 |
+
],
|
| 2193 |
)
|
| 2194 |
|
| 2195 |
# -------- Filtering & Search --------
|
|
|
|
| 2359 |
if __name__ == "__main__":
    # Disable SSR to avoid handler arity warnings under server-side rendering
    demo.launch(ssr_mode=False)
|
|
|