wuhp committed on
Commit
6af16b8
·
verified ·
1 Parent(s): 08fef30

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -110
app.py CHANGED
@@ -72,7 +72,7 @@ TAXONOMY = {
72
  "HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
73
  "Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
74
  "Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
75
- "Legal": ["legal","lawsuit","attorney","counsel","privileged","court","subpoena","confidential"],
76
  "IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
77
  "Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
78
  "Other": [],
@@ -94,38 +94,31 @@ def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
94
  if bucket == "IT/Security":
95
  return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
96
  if bucket == "Constituent":
97
- # personal mail to public office is a strong hint
98
  return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
99
  if bucket == "Lobbyist":
100
  return 5.0 if fd in LOBBY_DOMAINS else 0.0
101
  if bucket == "Legal":
102
  return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
103
  if bucket == "Scheduling":
104
- # ICS invite or explicit invite subject
105
  body = (row.get("body_text") or "")
106
  return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
107
  return 0.0
108
 
109
- MIN_ROUTE_SCORE = 1.5 # at least ~2 weak signals or one strong
110
  TIE_MARGIN = 1.0
111
 
112
  def route_email_row(row: pd.Series) -> str:
113
  text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
114
  scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
115
- # lexicon points
116
  for b, terms in TAXONOMY.items():
117
- if not terms:
118
  continue
119
- # count unique term hits to avoid over-crediting repeats
120
  hits = sum(1 for t in terms if t and t.lower() in text)
121
  scores[b] += float(hits)
122
- # strong phrases in your corruption lexicon can hint Lobbyist/Procurement
123
  if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
124
  scores[b] += 1.0
125
- # header bonuses
126
  for b in TAXONOMY.keys():
127
  scores[b] += _bucket_header_bonus(row, b)
128
- # choose
129
  best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
130
  second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
131
  if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
@@ -147,35 +140,28 @@ SKIP_LANGDETECT = True
147
 
148
  # ==== Expanded corruption lexicon ====
149
  SUSPECT_PHRASES = [
150
- # core corruption/finance
151
  "off the books","cover up","kickback","bribe","under the table",
152
  "no inspection","special fee","friendly payment","confidential deal",
153
  "nobody will find out","pay to play","cash only","shell company",
154
  "bid rigging","embezzle","slush fund","false invoice","ghost employee",
155
  "contract splitting","grease payment","unreported","unrecorded",
156
- # secrecy/evasion
157
  "off the record","just between us","don’t quote me on this","dont quote me on this",
158
  "we never had this conversation","keep this between us","not ethical","illegal",
159
  "grey area","gray area","write off","failed investment","they owe it to me",
160
- # off-channel comms
161
  "let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
162
  "don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
163
  "tell you on the phone","talk in person","come by my office","vpn",
164
- # financial secrecy & accounting games
165
  "tax haven","off-shore account","offshore account","backdate","pull earnings forward",
166
  "delete this email","no inspection","special fees","wire instructions",
167
  ]
168
- # Evasive acronyms / slang (case-insensitive)
169
  EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
170
 
171
- # Entity regexes
172
  MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
173
  PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
174
  INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
175
  COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
176
  ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
177
 
178
- # Off-channel patterns (apps / phrases)
179
  OFFCHANNEL_PATTERNS = [
180
  r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
181
  r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
@@ -184,10 +170,8 @@ OFFCHANNEL_PATTERNS = [
184
  ]
185
  OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
186
 
187
- # Common personal mail domains (used with user-specified trusted org domains)
188
  PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
189
 
190
- # Newsletter/newswire heuristics
191
  NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
192
  def is_news_like(subject: str, body: str, from_domain: str) -> bool:
193
  s = (subject or "").lower()
@@ -198,13 +182,11 @@ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
198
  if any(d in fd for d in NEWS_DOMAINS): return True
199
  return False
200
 
201
- # -------- System/notification heuristics (bucket as cluster -2) --------
202
  NOTIFY_PATTERNS = [
203
  r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
204
  r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
205
  r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
206
  r"unable to determine", r"reset your password", r"\balert\b",
207
- # bounces / gateways / quarantine
208
  r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
209
  r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
210
  r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
@@ -221,7 +203,6 @@ def is_notification_like(subject: str, body: str, from_email: str, from_domain:
221
  return True
222
  return False
223
 
224
- # -------- Fast language heuristic (used when skipping langdetect or on failure) --------
225
  HEB_RE = re.compile(r'[\u0590-\u05FF]')
226
  AR_RE = re.compile(r'[\u0600-\u06FF]')
227
  CYR_RE = re.compile(r'[\u0400-\u04FF]')
@@ -236,7 +217,6 @@ def fast_lang_heuristic(text: str) -> str:
236
  return "en"
237
  return "unknown"
238
 
239
- # Optional seeded themes for semi-supervised init (used only when LSA is ON)
240
  CORR_LEX = {
241
  "kickback" : ["kickback","bribe","under the table","gift","cash"],
242
  "invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
@@ -258,16 +238,12 @@ MONTHS = {
258
  "january","february","march","april","june","july","august","september",
259
  "october","november","december"
260
  }
261
-
262
- # Extra junk/HTML/MIME terms to suppress in labels (expanded)
263
  STOP_TERMS = {
264
  "div","span","nbsp","href","src","img","class","style","align","border","cid",
265
  "content","content-type","multipart","alternative","quoted","printable","utf",
266
  "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
267
  "type","id","service","person","generated","fyi"
268
  }
269
-
270
- # NEW: broader stop buckets for labels *and* features
271
  AUX_STOP = {
272
  "will","would","should","could","can","cant","cannot","did","do","does","done",
273
  "have","has","had","having","get","got","make","made","let","need","want",
@@ -288,20 +264,17 @@ TECH_META = {
288
  ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
289
  HE_EXTRA_STOP = {"עם","או"}
290
 
291
- # fold into STOP_TERMS and build a vectorizer stoplist
292
  STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
293
  EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
294
  YEAR_RE = re.compile(r"^(19|20)\d{2}$")
295
  NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
296
  ONE_CHAR_RE = re.compile(r"^.$")
297
 
298
- # ---- NEW junk token guards (base64/hex/trackers/very-long) ----
299
- LONG_ALNUM_RE = re.compile(r"^[A-Za-z0-9_-]{24,}$") # long tracking/b64-ish
300
- HEXISH_RE = re.compile(r"^(?:[A-Fa-f0-9]{8,})$") # long hex blobs
301
- DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$") # too many digits
302
  UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
303
 
304
- # This stoplist is used by the CountVectorizer (MUST be list for sklearn)
305
  STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
306
 
307
  def _is_junk_term(t: str) -> bool:
@@ -366,7 +339,6 @@ def strip_quotes_and_sigs(text: str) -> str:
366
  cut = idx if (cut is None or idx < cut) else cut
367
  if cut is not None:
368
  text = text[:cut]
369
- # extra safety for mobile signatures that sneak through
370
  text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
371
  text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
372
  return text.strip()
@@ -439,7 +411,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
439
  if str(raw.get("type", "")).lower() == "meta":
440
  return {}
441
 
442
- # attachments (names); accept common schemas
443
  attach_names = []
444
  atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
445
  if isinstance(atts, list):
@@ -490,7 +461,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
490
 
491
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
492
 
493
- # Language (use fast heuristic if skipping or detector fails)
494
  if use_langdetect:
495
  try:
496
  lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
@@ -562,30 +532,23 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
562
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
563
  return df
564
 
565
- # Visual highlight helpers
566
  def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
567
  terms = []
568
  txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
569
- # suspect phrases found in this row
570
  for p in SUSPECT_PHRASES:
571
  if p in txt:
572
  terms.append(p)
573
- # entity markers
574
  if MONEY_RE.search(txt): terms.append("$")
575
  if INVOICE_RE.search(txt): terms.append("invoice")
576
- # regex-based (keep as literal samples)
577
  if PHONE_RE.search(row.get("body_text","") or ""): terms.append("phone")
578
- # extras from user input
579
  for t in extra_terms or []:
580
  t=t.strip()
581
  if t and t.lower() in txt:
582
  terms.append(t)
583
- # dedupe
584
  out, seen = [], set()
585
  for t in terms:
586
  if t.lower() not in seen:
587
- out.append(t)
588
- seen.add(t.lower())
589
  return out[:24]
590
 
591
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
@@ -603,7 +566,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
603
  hl_terms = []
604
  if do_highlight:
605
  hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
606
- # make unique, case-insensitive
607
  seen=set(); uniq=[]
608
  for t in hl_terms:
609
  tl=t.lower()
@@ -668,11 +630,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
668
 
669
  # ---------- Lightweight Embedding Utilities (Optional) ----------
670
  def _load_embeddings(emb_path: str, binary: bool):
671
- """
672
- Load word vectors with Gensim if available.
673
- Accepts word2vec binary (.bin) or text formats (.txt/.vec).
674
- Returns (model, dim) or (None, 0) if not available.
675
- """
676
  if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
677
  return None, 0
678
  try:
@@ -682,7 +639,6 @@ def _load_embeddings(emb_path: str, binary: bool):
682
  kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
683
  return kv, int(kv.vector_size)
684
  except Exception:
685
- # Attempt GloVe-like with headerless text
686
  try:
687
  kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
688
  return kv, int(kv.vector_size)
@@ -690,10 +646,6 @@ def _load_embeddings(emb_path: str, binary: bool):
690
  return None, 0
691
 
692
  def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
693
- """
694
- Average embeddings over tokens matched by TOKEN_PATTERN.
695
- Returns zero vector if nothing matches or kv is None.
696
- """
697
  vec = np.zeros((dim,), dtype=np.float32)
698
  if not kv or not text:
699
  return vec
@@ -705,16 +657,12 @@ def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
705
  cnt += 1
706
  if cnt > 0:
707
  vec /= float(cnt)
708
- # L2-normalize
709
  n = np.linalg.norm(vec)
710
  if n > 0:
711
  vec /= n
712
  return vec
713
 
714
  def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
715
- """
716
- Build [n_docs, dim] dense matrix of averaged embeddings.
717
- """
718
  if not kv or dim <= 0:
719
  return np.zeros((len(texts), 0), dtype=np.float32)
720
  out = np.zeros((len(texts), dim), dtype=np.float32)
@@ -773,15 +721,8 @@ def cluster_labels_pmi_bigram(
773
  topn=6,
774
  subject_alpha=0.75,
775
  global_ubiq_cut=0.20,
776
- subject_min_cov=0.30 # NEW: prefer subject terms that appear in ≥30% of a cluster's subjects
777
  ):
778
- """
779
- Improved labeler:
780
- - Considers bigrams AND trigrams (PMI vs. global)
781
- - Class-TFIDF unigrams with subject coverage boost
782
- - Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
783
- - NEW: prefers subject terms that occur in ≥30% of cluster subjects (can be tuned via subject_min_cov)
784
- """
785
  import math as _math
786
  from collections import Counter, defaultdict
787
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -792,12 +733,11 @@ def cluster_labels_pmi_bigram(
792
  def is_junk_token(tok: str) -> bool:
793
  if _is_junk_term(tok): return True
794
  tl = tok.lower()
795
- if tl.startswith("__"): return True # our feature flags
796
  if "@" in tl: return True
797
  if tl.isascii() and len(tl) <= 2: return True
798
  if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
799
  if len(tok) > 40: return True
800
- # strip punctuation-heavy artifacts (URLs already replaced with 'URL')
801
  if re.search(r"[^\w\-’']", tl): return True
802
  return False
803
 
@@ -808,7 +748,6 @@ def cluster_labels_pmi_bigram(
808
  def ngrams(toks, n):
809
  return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
810
 
811
- # Compute global doc frequency for tokens, bigrams, trigrams
812
  glob_df_uni = Counter()
813
  glob_df_bg = Counter()
814
  glob_df_tri = Counter()
@@ -822,23 +761,19 @@ def cluster_labels_pmi_bigram(
822
 
823
  have_subjects = subjects is not None and len(subjects) == len(texts)
824
 
825
- # Pre-pass: DF stats
826
  for idx, (txt, c) in enumerate(zip(texts, labels)):
827
  c = int(c)
828
  toks = tokenize_clean(txt)
829
  uni_set = set(toks)
830
  bg_set = set(ngrams(toks, 2))
831
  tri_set = set(ngrams(toks, 3))
832
- # DF
833
  glob_df_uni.update(uni_set)
834
  glob_df_bg.update(bg_set)
835
  glob_df_tri.update(tri_set)
836
- # Per-cluster counts
837
  per_c_bg[c].update(bg_set)
838
  per_c_tri[c].update(tri_set)
839
  per_c_texts[c].append(" ".join(toks))
840
  per_c_doc_count[c] += 1
841
- # Subject presence
842
  if have_subjects:
843
  stoks = tokenize_clean(subjects[idx] or "")
844
  s_uni = set(stoks)
@@ -849,16 +784,13 @@ def cluster_labels_pmi_bigram(
849
  per_c_subj_tri_docs[c].update(s_tri)
850
 
851
  N = max(1, len(texts))
852
- labels_out = {}
853
 
854
- # Helper: ubiquity filter
855
- def too_ubiquitous(df_count): # fraction of docs
856
  return (df_count / float(N)) > float(global_ubiq_cut)
857
 
 
858
  for c in sorted(set(int(x) for x in labels)):
859
  n_docs_c = max(1, per_c_doc_count[c])
860
-
861
- # ===== PMI bigrams & trigrams with subject-coverage boost & ≥30% preference =====
862
  phrases = []
863
  for store, glob_df, subj_docs, n in (
864
  (per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
@@ -868,7 +800,7 @@ def cluster_labels_pmi_bigram(
868
  total_g = sum(glob_df.values()) + 1e-12
869
  scored = []
870
  for ng, cnt in store.most_common(3000):
871
- if too_ubiquitous(glob_df[ng]): # skip ubiquitous n-grams
872
  continue
873
  p_ng_c = cnt / total_c
874
  p_ng_g = (glob_df[ng] / total_g)
@@ -877,17 +809,14 @@ def cluster_labels_pmi_bigram(
877
  cov = 0.0
878
  if have_subjects:
879
  cov = subj_docs[ng] / n_docs_c
880
- # add a preference bump if subject coverage ≥ threshold
881
  if cov >= subject_min_cov:
882
- score += 0.6 # modest, ensures such terms bubble up
883
  score += subject_alpha * cov
884
  scored.append((score, cov, ng))
885
- # prefer subject-coverage ≥ threshold first, then highest score
886
  scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
887
  take = max(1, topn // (3 if n == 3 else 2))
888
  phrases.extend([p for _, _, p in scored[:take]])
889
 
890
- # ===== Class-TFIDF unigrams with subject coverage boost & ≥30% preference =====
891
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
892
  docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
893
  corpus = [docs_c[0], docs_bg[0]]
@@ -904,15 +833,13 @@ def cluster_labels_pmi_bigram(
904
  vocab_index = {t:i for i,t in enumerate(vocab)}
905
  if have_subjects:
906
  for tok, cnt_docs in per_c_subj_uni_docs[c].items():
907
- if tok in vocab_index and not is_junk_token(tok):
908
  i = vocab_index[tok]
909
  frac = cnt_docs / n_docs_c
910
  subj_cov[i] = frac
911
  subj_cov_frac[i] = frac
912
 
913
- # base + subject alpha
914
  row_boosted = row + subject_alpha * subj_cov
915
- # final score gets a preference bump if subj coverage ≥ threshold (but not a hard filter)
916
  pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
917
  final = row_boosted + pref_bump
918
 
@@ -920,8 +847,8 @@ def cluster_labels_pmi_bigram(
920
  unis = []
921
  for i in order:
922
  tok = vocab[i]
923
- if is_junk_token(tok): continue
924
- if too_ubiquitous(glob_df_uni.get(tok, 0)): # suppress ubiquitous tokens
925
  continue
926
  unis.append(tok)
927
  if len(unis) >= max(0, topn - len(phrases)):
@@ -935,11 +862,9 @@ def cluster_labels_pmi_bigram(
935
  # =================== Auto-k & merge ===================
936
  def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
937
  n = X.shape[0]
938
- # NEW: tiny-partition guard
939
  if n <= 1:
940
  return 1, {1: 0.0}
941
  if n < min(ks):
942
- # pick a small, reasonable k for tiny n
943
  k_small = max(2, min(10, n))
944
  return int(k_small), {int(k_small): 0.0}
945
 
@@ -952,7 +877,7 @@ def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
952
  inertias = []
953
  for k in ks:
954
  k = int(k)
955
- if n < k: # extra safety
956
  break
957
  km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
958
  km.fit(Xs)
@@ -981,7 +906,7 @@ def merge_close_clusters(labels, centers, thresh=0.92):
981
  while parent[a]!=a: a=parent[a]
982
  return a
983
  for i in range(k):
984
- for j in range(i+1, k): # fixed loop bound
985
  if sim[i,j] >= thresh:
986
  pi, pj = find(i), find(j)
987
  if pi!=pj: parent[pj]=pi
@@ -1024,7 +949,7 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
1024
  return seeds_red
1025
  return None
1026
 
1027
- # =================== NEW: cluster stabilizer (merge near-dupes + reassign tiny → big; else NOISE=-3) ===================
1028
  def _centroids_from_labels(X, labels):
1029
  labs = np.asarray(labels, dtype=int)
1030
  uniq = np.unique(labs)
@@ -1038,7 +963,6 @@ def _centroids_from_labels(X, labels):
1038
  if n > 0: v = v / n
1039
  cents[int(c)] = v.astype(np.float32)
1040
  return cents
1041
- # CSR sparse
1042
  X = X.tocsr()
1043
  for c in uniq:
1044
  rows = np.where(labs == c)[0]
@@ -1054,7 +978,7 @@ def _cosine_sim_to_centroids(vecs, centroids):
1054
  if not centroids:
1055
  return None, None
1056
  keys = list(centroids.keys())
1057
- C = np.stack([centroids[k] for k in keys], axis=0) # (k,d)
1058
  if isinstance(vecs, np.ndarray):
1059
  sims = vecs @ C.T
1060
  else:
@@ -1067,8 +991,6 @@ def _cosine_sim_to_centroids(vecs, centroids):
1067
 
1068
  def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
1069
  labs = np.asarray(labels, dtype=int)
1070
-
1071
- # 1) merge very close centroids
1072
  cents = _centroids_from_labels(X_space, labs)
1073
  keys = sorted([k for k in cents.keys() if k >= 0])
1074
  if len(keys) >= 2:
@@ -1090,7 +1012,6 @@ def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_t
1090
  labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
1091
  cents = _centroids_from_labels(X_space, labs)
1092
 
1093
- # 2) reassign tiny clusters to nearest big centroid (else noise -3)
1094
  vc = pd.Series(labs).value_counts()
1095
  big_labs = set(vc[vc >= int(min_size)].index.tolist())
1096
  small_labs = set(vc[vc < int(min_size)].index.tolist())
@@ -1155,31 +1076,27 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
1155
  df_in["context_anomaly_score"] = 0.0
1156
  return df_in
1157
 
1158
- # 1) IsolationForest percentile -> 0–6 (you already computed anomaly_score per partition; lower is “more anomalous” if using score_samples with sign reversed above)
1159
  df = df_in.copy()
1160
  if "anomaly_score" in df.columns:
1161
- # higher = more anomalous in your current pipeline (you negated score_samples). Convert to percentile per bucket.
1162
  df["_if_pct"] = 0.0
1163
  for bkt, grp in df.groupby("bucket", dropna=False):
1164
  vals = grp["anomaly_score"].astype(float)
1165
  if vals.notna().sum() >= 5:
1166
- ranks = vals.rank(pct=True, ascending=False) # top anomaly gets 1.0
1167
  df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
1168
  df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
1169
  else:
1170
  df["_if_pts"] = 0.0
1171
 
1172
- # 2) Rule violations per bucket (0–2)
1173
  df["_rule_pts"] = 0.0
1174
  low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
1175
  for bkt, terms in TAXONOMY.items():
1176
  mask = (df["bucket"] == bkt)
1177
- if not mask.any():
1178
  continue
1179
  if terms:
1180
  has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
1181
  df.loc[mask & (~has_term), "_rule_pts"] += 1.0
1182
- # header expectation examples:
1183
  if bkt == "Constituent":
1184
  df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
1185
  if bkt == "Scheduling":
@@ -1187,8 +1104,6 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
1187
  df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0
1188
 
1189
  df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
1190
-
1191
- # 3) Corruption heuristics capped to 0–3
1192
  df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
1193
 
1194
  df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
@@ -1522,6 +1437,8 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
1522
 
1523
  with gr.Row():
1524
  run_btn = gr.Button("Process", variant="primary")
 
 
1525
  reset_btn = gr.Button("Reset filters")
1526
  status = gr.Markdown("")
1527
 
@@ -1561,8 +1478,9 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
1561
  state_dims = gr.State()
1562
  state_extra_terms = gr.State()
1563
  state_highlight = gr.State()
 
1564
 
1565
- # -------- IO helpers --------
1566
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
1567
  recs: List[Dict[str, Any]] = []
1568
  if local_path.endswith(".jsonl"):
@@ -1945,7 +1863,13 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
1945
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1946
  extra_terms_lower = [t.lower() for t in extra_terms]
1947
 
1948
- recs = _load_json_records(inbox_file.name)
 
 
 
 
 
 
1949
  if not recs:
1950
  return ("**No valid records found.**",
1951
  None, None, None, None, None,
@@ -2234,6 +2158,38 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
2234
  state_extra_terms, state_highlight,
2235
  bucket_drop,
2236
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2237
  )
2238
 
2239
  # -------- Filtering & Search --------
@@ -2403,4 +2359,3 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
2403
if __name__ == "__main__":
    # Disable SSR to avoid handler arity warnings under server-side rendering
    demo.launch(ssr_mode=False)
 
72
  "HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
73
  "Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
74
  "Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
75
+ "Legal": ["legal","lawsuit","intake","attorney","counsel","privileged","court","subpoena","confidential"],
76
  "IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
77
  "Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
78
  "Other": [],
 
94
  if bucket == "IT/Security":
95
  return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
96
  if bucket == "Constituent":
 
97
  return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
98
  if bucket == "Lobbyist":
99
  return 5.0 if fd in LOBBY_DOMAINS else 0.0
100
  if bucket == "Legal":
101
  return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
102
  if bucket == "Scheduling":
 
103
  body = (row.get("body_text") or "")
104
  return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
105
  return 0.0
106
 
107
+ MIN_ROUTE_SCORE = 1.5
108
  TIE_MARGIN = 1.0
109
 
110
  def route_email_row(row: pd.Series) -> str:
111
  text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
112
  scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
 
113
  for b, terms in TAXONOMY.items():
114
+ if not terms:
115
  continue
 
116
  hits = sum(1 for t in terms if t and t.lower() in text)
117
  scores[b] += float(hits)
 
118
  if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
119
  scores[b] += 1.0
 
120
  for b in TAXONOMY.keys():
121
  scores[b] += _bucket_header_bonus(row, b)
 
122
  best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
123
  second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
124
  if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
 
140
 
141
# ==== Expanded corruption lexicon ====
# Lowercase phrases checked by plain substring match against lowercased
# subject+body text elsewhere in this module. Curly-apostrophe and
# straight-apostrophe spellings are both listed on purpose.
SUSPECT_PHRASES = [
    # payments / procurement fraud
    "off the books", "cover up", "kickback", "bribe", "under the table",
    "no inspection", "special fee", "friendly payment", "confidential deal",
    "nobody will find out", "pay to play", "cash only", "shell company",
    "bid rigging", "embezzle", "slush fund", "false invoice", "ghost employee",
    "contract splitting", "grease payment", "unreported", "unrecorded",
    # secrecy / hedging language
    "off the record", "just between us", "don’t quote me on this", "dont quote me on this",
    "we never had this conversation", "keep this between us", "not ethical", "illegal",
    "grey area", "gray area", "write off", "failed investment", "they owe it to me",
    # moving the conversation off-channel
    "let’s take this offline", "lets take this offline", "send to my gmail", "send to my yahoo",
    "don’t leave a trail", "dont leave a trail", "call my cell", "text me", "don’t text me", "dont text me",
    "tell you on the phone", "talk in person", "come by my office", "vpn",
    # finance / records manipulation
    # NOTE: the original list repeated "no inspection" here; the duplicate
    # was removed (membership and substring checks are unaffected).
    "tax haven", "off-shore account", "offshore account", "backdate", "pull earnings forward",
    "delete this email", "special fees", "wire instructions",
]

# Short evasive acronyms, matched as whole words, case-insensitively.
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)

# Currency symbol/code followed by an amount (e.g. "$1,200", "USD 50").
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
# Loose phone-number shape: optional country/area prefixes, then 7-8 digits.
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
# Invoice / purchase-order / wire vocabulary.
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
# Corporate suffixes that hint a company name is present.
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
# Risk-laden words when they appear in attachment file names.
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
 
 
165
  OFFCHANNEL_PATTERNS = [
166
  r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
167
  r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
 
170
  ]
171
  OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
172
 
 
173
# Free/webmail providers: mail from these counts as "personal", not organizational.
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}

# Major news-outlet domains used to spot press / newsletter traffic.
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
176
  def is_news_like(subject: str, body: str, from_domain: str) -> bool:
177
  s = (subject or "").lower()
 
182
  if any(d in fd for d in NEWS_DOMAINS): return True
183
  return False
184
 
 
185
  NOTIFY_PATTERNS = [
186
  r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
187
  r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
188
  r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
189
  r"unable to determine", r"reset your password", r"\balert\b",
 
190
  r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
191
  r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
192
  r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
 
203
  return True
204
  return False
205
 
 
206
# Unicode-block detectors for quick script identification in message text.
HEB_RE = re.compile(r'[\u0590-\u05FF]')  # Hebrew block
AR_RE = re.compile(r'[\u0600-\u06FF]')   # Arabic block
CYR_RE = re.compile(r'[\u0400-\u04FF]')  # Cyrillic block
 
217
  return "en"
218
  return "unknown"
219
 
 
220
  CORR_LEX = {
221
  "kickback" : ["kickback","bribe","under the table","gift","cash"],
222
  "invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
 
238
  "january","february","march","april","june","july","august","september",
239
  "october","november","december"
240
  }
 
 
241
# HTML/MIME artifacts and boilerplate tokens excluded from term mining.
# (Extended further below via |= with other stop sets.)
STOP_TERMS = {
    "div","span","nbsp","href","src","img","class","style","align","border","cid",
    "content","content-type","multipart","alternative","quoted","printable","utf",
    "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
    "type","id","service","person","generated","fyi",
}
 
 
247
  AUX_STOP = {
248
  "will","would","should","could","can","cant","cannot","did","do","does","done",
249
  "have","has","had","having","get","got","make","made","let","need","want",
 
264
# Chinese mail-header tokens (From/To/Subject/weekday labels) that leak into forwarded bodies.
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
# A few extremely common Hebrew function words treated as stopwords.
HE_EXTRA_STOP = {"עם","או"}
266
 
 
267
  STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
268
# Tokens that look like e-mail addresses or bare domain names.
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
# Four-digit years 1900-2099.
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
# Pure numeric tokens, optionally with date/amount separators.
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
# Single-character tokens.
ONE_CHAR_RE = re.compile(r"^.$")
272
 
273
# Opaque identifier-like tokens (API keys, hashes): 24+ chars of [A-Za-z0-9_-].
LONG_ALNUM_RE = re.compile(r"^[A-Za-z0-9_-]{24,}$")
# Hex-looking blobs of 8 or more characters.
HEXISH_RE = re.compile(r"^(?:[A-Fa-f0-9]{8,})$")
# Tokens containing six or more digits overall (ids, timestamps).
DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$")

# Tokens containing an underscore (snake_case artifacts, field names).
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
277
 
 
278
  STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
279
 
280
  def _is_junk_term(t: str) -> bool:
 
339
  cut = idx if (cut is None or idx < cut) else cut
340
  if cut is not None:
341
  text = text[:cut]
 
342
  text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
343
  text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
344
  return text.strip()
 
411
  if str(raw.get("type", "")).lower() == "meta":
412
  return {}
413
 
 
414
  attach_names = []
415
  atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
416
  if isinstance(atts, list):
 
461
 
462
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
463
 
 
464
  if use_langdetect:
465
  try:
466
  lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
 
532
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
533
  return df
534
 
 
535
  def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
536
  terms = []
537
  txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
 
538
  for p in SUSPECT_PHRASES:
539
  if p in txt:
540
  terms.append(p)
 
541
  if MONEY_RE.search(txt): terms.append("$")
542
  if INVOICE_RE.search(txt): terms.append("invoice")
 
543
  if PHONE_RE.search(row.get("body_text","") or ""): terms.append("phone")
 
544
  for t in extra_terms or []:
545
  t=t.strip()
546
  if t and t.lower() in txt:
547
  terms.append(t)
 
548
  out, seen = [], set()
549
  for t in terms:
550
  if t.lower() not in seen:
551
+ out.append(t); seen.add(t.lower())
 
552
  return out[:24]
553
 
554
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
 
566
  hl_terms = []
567
  if do_highlight:
568
  hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
 
569
  seen=set(); uniq=[]
570
  for t in hl_terms:
571
  tl=t.lower()
 
630
 
631
  # ---------- Lightweight Embedding Utilities (Optional) ----------
632
  def _load_embeddings(emb_path: str, binary: bool):
 
 
 
 
 
633
  if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
634
  return None, 0
635
  try:
 
639
  kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
640
  return kv, int(kv.vector_size)
641
  except Exception:
 
642
  try:
643
  kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
644
  return kv, int(kv.vector_size)
 
646
  return None, 0
647
 
648
  def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
 
 
 
 
649
  vec = np.zeros((dim,), dtype=np.float32)
650
  if not kv or not text:
651
  return vec
 
657
  cnt += 1
658
  if cnt > 0:
659
  vec /= float(cnt)
 
660
  n = np.linalg.norm(vec)
661
  if n > 0:
662
  vec /= n
663
  return vec
664
 
665
  def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
 
 
 
666
  if not kv or dim <= 0:
667
  return np.zeros((len(texts), 0), dtype=np.float32)
668
  out = np.zeros((len(texts), dim), dtype=np.float32)
 
721
  topn=6,
722
  subject_alpha=0.75,
723
  global_ubiq_cut=0.20,
724
+ subject_min_cov=0.30
725
  ):
 
 
 
 
 
 
 
726
  import math as _math
727
  from collections import Counter, defaultdict
728
  from sklearn.feature_extraction.text import TfidfVectorizer
 
733
def is_junk_token(tok: str) -> bool:
    """True when a token should be excluded from label/phrase mining."""
    # Delegate to the shared module-level junk-term filter first.
    if _is_junk_term(tok):
        return True
    tl = tok.lower()
    # Dunder-style artifacts (e.g. placeholder field names).
    if tl.startswith("__"):
        return True
    # Anything address-like.
    if "@" in tl:
        return True
    # Very short ASCII tokens carry no topical signal.
    if tl.isascii() and len(tl) <= 2:
        return True
    # Opaque ids: long alnum runs, hex blobs, digit-heavy strings.
    if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok):
        return True
    if len(tok) > 40:
        return True
    # Punctuation other than hyphen/apostrophe marks markup residue.
    if re.search(r"[^\w\-’']", tl):
        return True
    return False
743
 
 
748
def ngrams(toks, n):
    """All length-n windows over toks, skipping any window containing a junk token."""
    return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
750
 
 
751
  glob_df_uni = Counter()
752
  glob_df_bg = Counter()
753
  glob_df_tri = Counter()
 
761
 
762
  have_subjects = subjects is not None and len(subjects) == len(texts)
763
 
 
764
  for idx, (txt, c) in enumerate(zip(texts, labels)):
765
  c = int(c)
766
  toks = tokenize_clean(txt)
767
  uni_set = set(toks)
768
  bg_set = set(ngrams(toks, 2))
769
  tri_set = set(ngrams(toks, 3))
 
770
  glob_df_uni.update(uni_set)
771
  glob_df_bg.update(bg_set)
772
  glob_df_tri.update(tri_set)
 
773
  per_c_bg[c].update(bg_set)
774
  per_c_tri[c].update(tri_set)
775
  per_c_texts[c].append(" ".join(toks))
776
  per_c_doc_count[c] += 1
 
777
  if have_subjects:
778
  stoks = tokenize_clean(subjects[idx] or "")
779
  s_uni = set(stoks)
 
784
  per_c_subj_tri_docs[c].update(s_tri)
785
 
786
  N = max(1, len(texts))
 
787
 
788
def too_ubiquitous(df_count):
    # True when the term's document frequency exceeds the global cut
    # (N and global_ubiq_cut come from the enclosing scope).
    return (df_count / float(N)) > float(global_ubiq_cut)
790
 
791
+ labels_out = {}
792
  for c in sorted(set(int(x) for x in labels)):
793
  n_docs_c = max(1, per_c_doc_count[c])
 
 
794
  phrases = []
795
  for store, glob_df, subj_docs, n in (
796
  (per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
 
800
  total_g = sum(glob_df.values()) + 1e-12
801
  scored = []
802
  for ng, cnt in store.most_common(3000):
803
+ if too_ubiquitous(glob_df[ng]):
804
  continue
805
  p_ng_c = cnt / total_c
806
  p_ng_g = (glob_df[ng] / total_g)
 
809
  cov = 0.0
810
  if have_subjects:
811
  cov = subj_docs[ng] / n_docs_c
 
812
  if cov >= subject_min_cov:
813
+ score += 0.6
814
  score += subject_alpha * cov
815
  scored.append((score, cov, ng))
 
816
  scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
817
  take = max(1, topn // (3 if n == 3 else 2))
818
  phrases.extend([p for _, _, p in scored[:take]])
819
 
 
820
  docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
821
  docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
822
  corpus = [docs_c[0], docs_bg[0]]
 
833
  vocab_index = {t:i for i,t in enumerate(vocab)}
834
  if have_subjects:
835
  for tok, cnt_docs in per_c_subj_uni_docs[c].items():
836
+ if tok in vocab_index and not _is_junk_term(tok):
837
  i = vocab_index[tok]
838
  frac = cnt_docs / n_docs_c
839
  subj_cov[i] = frac
840
  subj_cov_frac[i] = frac
841
 
 
842
  row_boosted = row + subject_alpha * subj_cov
 
843
  pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
844
  final = row_boosted + pref_bump
845
 
 
847
  unis = []
848
  for i in order:
849
  tok = vocab[i]
850
+ if _is_junk_term(tok): continue
851
+ if too_ubiquitous(glob_df_uni.get(tok, 0)):
852
  continue
853
  unis.append(tok)
854
  if len(unis) >= max(0, topn - len(phrases)):
 
862
  # =================== Auto-k & merge ===================
863
  def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
864
  n = X.shape[0]
 
865
  if n <= 1:
866
  return 1, {1: 0.0}
867
  if n < min(ks):
 
868
  k_small = max(2, min(10, n))
869
  return int(k_small), {int(k_small): 0.0}
870
 
 
877
  inertias = []
878
  for k in ks:
879
  k = int(k)
880
+ if n < k:
881
  break
882
  km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
883
  km.fit(Xs)
 
906
  while parent[a]!=a: a=parent[a]
907
  return a
908
  for i in range(k):
909
+ for j in range(i+1, k):
910
  if sim[i,j] >= thresh:
911
  pi, pj = find(i), find(j)
912
  if pi!=pj: parent[pj]=pi
 
949
  return seeds_red
950
  return None
951
 
952
+ # =================== NEW: cluster stabilizer ===================
953
  def _centroids_from_labels(X, labels):
954
  labs = np.asarray(labels, dtype=int)
955
  uniq = np.unique(labs)
 
963
  if n > 0: v = v / n
964
  cents[int(c)] = v.astype(np.float32)
965
  return cents
 
966
  X = X.tocsr()
967
  for c in uniq:
968
  rows = np.where(labs == c)[0]
 
978
  if not centroids:
979
  return None, None
980
  keys = list(centroids.keys())
981
+ C = np.stack([centroids[k] for k in keys], axis=0)
982
  if isinstance(vecs, np.ndarray):
983
  sims = vecs @ C.T
984
  else:
 
991
 
992
  def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
993
  labs = np.asarray(labels, dtype=int)
 
 
994
  cents = _centroids_from_labels(X_space, labs)
995
  keys = sorted([k for k in cents.keys() if k >= 0])
996
  if len(keys) >= 2:
 
1012
  labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
1013
  cents = _centroids_from_labels(X_space, labs)
1014
 
 
1015
  vc = pd.Series(labs).value_counts()
1016
  big_labs = set(vc[vc >= int(min_size)].index.tolist())
1017
  small_labs = set(vc[vc < int(min_size)].index.tolist())
 
1076
  df_in["context_anomaly_score"] = 0.0
1077
  return df_in
1078
 
 
1079
  df = df_in.copy()
1080
  if "anomaly_score" in df.columns:
 
1081
  df["_if_pct"] = 0.0
1082
  for bkt, grp in df.groupby("bucket", dropna=False):
1083
  vals = grp["anomaly_score"].astype(float)
1084
  if vals.notna().sum() >= 5:
1085
+ ranks = vals.rank(pct=True, ascending=False)
1086
  df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
1087
  df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
1088
  else:
1089
  df["_if_pts"] = 0.0
1090
 
 
1091
  df["_rule_pts"] = 0.0
1092
  low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
1093
  for bkt, terms in TAXONOMY.items():
1094
  mask = (df["bucket"] == bkt)
1095
+ if not mask.any():
1096
  continue
1097
  if terms:
1098
  has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
1099
  df.loc[mask & (~has_term), "_rule_pts"] += 1.0
 
1100
  if bkt == "Constituent":
1101
  df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
1102
  if bkt == "Scheduling":
 
1104
  df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0
1105
 
1106
  df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
 
 
1107
  df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
1108
 
1109
  df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
 
1437
 
1438
  with gr.Row():
1439
  run_btn = gr.Button("Process", variant="primary")
1440
+ # NEW: Update button lets you re-run with same uploaded file & current settings
1441
+ update_btn = gr.Button("Update", variant="secondary") # NEW: Update
1442
  reset_btn = gr.Button("Reset filters")
1443
  status = gr.Markdown("")
1444
 
 
1478
  state_dims = gr.State()
1479
  state_extra_terms = gr.State()
1480
  state_highlight = gr.State()
1481
+ state_inbox = gr.State() # NEW: keep last uploaded file for Update
1482
 
1483
+ # -------- IO helpers --------
1484
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
1485
  recs: List[Dict[str, Any]] = []
1486
  if local_path.endswith(".jsonl"):
 
1863
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
1864
  extra_terms_lower = [t.lower() for t in extra_terms]
1865
 
1866
+ # Handle Gradio file object
1867
+ try:
1868
+ infile_path = inbox_file.name
1869
+ except Exception:
1870
+ infile_path = str(inbox_file) if inbox_file else ""
1871
+
1872
+ recs = _load_json_records(infile_path)
1873
  if not recs:
1874
  return ("**No valid records found.**",
1875
  None, None, None, None, None,
 
2158
  state_extra_terms, state_highlight,
2159
  bucket_drop,
2160
  ],
2161
+ ).then(
2162
+ # remember the uploaded file for future "Update" runs
2163
+ lambda f: f, inputs=[inbox_file], outputs=[state_inbox]
2164
+ )
2165
+
2166
+ # Keep state_inbox in sync whenever a new file is uploaded
2167
+ inbox_file.change(lambda f: f, inputs=[inbox_file], outputs=[state_inbox])
2168
+
2169
+ # NEW: Bind Update button — re-run with the last uploaded file + current settings
2170
+ update_btn.click(
2171
+ process_file,
2172
+ inputs=[
2173
+ state_inbox, max_features, min_df, max_df, use_bigrams, skip_lang,
2174
+ use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
2175
+ trusted_domains_in, extra_keywords_in, highlight_toggle,
2176
+ use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
2177
+ per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
2178
+ watchlist_in, min_mentions
2179
+ ],
2180
+ outputs=[
2181
+ status, cluster_counts_df, domain_counts_df, sender_counts_df,
2182
+ actors_df, offhours_df,
2183
+ surv_entities_df, surv_samples_df,
2184
+ results_df,
2185
+ state_df, state_vec, state_X_reduced,
2186
+ state_index, state_term_names,
2187
+ state_use_lsa, state_use_faiss,
2188
+ cluster_drop, domain_drop, sender_drop, lang_drop,
2189
+ state_svd, state_norm, state_dims,
2190
+ state_extra_terms, state_highlight,
2191
+ bucket_drop,
2192
+ ],
2193
  )
2194
 
2195
  # -------- Filtering & Search --------
 
2359
  if __name__ == "__main__":
2360
  # Disable SSR to avoid handler arity warnings under server-side rendering
2361
  demo.launch(ssr_mode=False)