wuhp commited on
Commit
d4eefe7
·
verified ·
1 Parent(s): d5cce46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +383 -135
app.py CHANGED
@@ -25,6 +25,13 @@ from sklearn.preprocessing import Normalizer
25
  from sklearn.preprocessing import normalize as sk_normalize
26
  from sklearn.metrics.pairwise import cosine_similarity
27
 
 
 
 
 
 
 
 
28
  from scipy.sparse import hstack
29
 
30
  # Optional fast ANN (CPU)
@@ -34,7 +41,7 @@ try:
34
  except Exception:
35
  FAISS_OK = False
36
 
37
- # Optional, but strongly recommended (tiny + fast)
38
  try:
39
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
40
  VADER_OK = True
@@ -42,44 +49,59 @@ except Exception:
42
  VADER_OK = False
43
 
44
  # =================== Regex & Flags ===================
45
- # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
46
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
47
-
48
- # URLs -> "URL" (reduce feature bloat).
49
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
50
-
51
- # Quote lines ("> ...")
52
  QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
53
-
54
- # Signature separator: lines after "-- " (standard)
55
  SIG_RE = re.compile(r"\n-- ?\n", re.M)
56
-
57
- # Device footers
58
  SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
59
  HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
60
-
61
- # Forward/quoted markers
62
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
63
  FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
64
  ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
65
 
66
- # Toggle for language detection (skip for speed)
67
  SKIP_LANGDETECT = True
68
 
69
- # Corruption keyword/phrase list (you can extend freely)
70
  SUSPECT_PHRASES = [
71
- "off the books", "cover up", "kickback", "bribe", "under the table",
72
- "no inspection", "special fee", "friendly payment", "confidential deal",
73
- "nobody will find out", "pay to play", "cash only", "shell company",
74
- "bid rigging", "embezzle", "slush fund", "false invoice", "ghost employee",
75
- "contract splitting", "grease payment", "unreported", "unrecorded",
 
 
 
 
 
 
 
 
 
 
 
 
76
  ]
 
 
77
 
78
- # Entity regexes for enrichment/scoring
79
  MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
80
  PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
81
- INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order)\b', re.I)
82
  COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # Optional seeded themes for semi-supervised init (used only when LSA is ON)
85
  CORR_LEX = {
@@ -89,19 +111,15 @@ CORR_LEX = {
89
  "money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
90
  }
91
 
92
- # =================== Label cleanup helpers ===================
93
  EN_STOP = {
94
  "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
95
  "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
96
  "re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
97
  "message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
98
- "herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
99
- "ny"
100
- }
101
- HE_STOP = {
102
- "של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
103
- "שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"
104
  }
 
105
  MONTHS = {
106
  "jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
107
  "january","february","march","april","june","july","august","september",
@@ -116,14 +134,10 @@ def _is_junk_term(t: str) -> bool:
116
  tl = t.lower()
117
  if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
118
  return True
119
- if EMAIL_LIKE_RE.search(tl):
120
- return True
121
- if YEAR_RE.match(tl):
122
- return True
123
- if NUMERIC_RE.match(tl):
124
- return True
125
- if ONE_CHAR_RE.match(tl):
126
- return True
127
  return False
128
 
129
  def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
@@ -147,7 +161,7 @@ def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarra
147
  break
148
  return cleaned
149
 
150
- # =================== HTML/Text Cleanup ===================
151
  def html_to_text(html: str) -> str:
152
  if not html:
153
  return ""
@@ -183,10 +197,19 @@ def parse_name_email(s: str) -> Tuple[str, str]:
183
  return (m.group(1) or "").strip(), (m.group(2) or "").strip()
184
  return "", s.strip()
185
 
 
 
 
 
 
 
 
 
 
186
  def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
187
  headers: Dict[str, str] = {}
188
  lines = (text or "").splitlines()
189
- header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
190
  i = 0
191
  saw_header = False
192
  while i < len(lines):
@@ -234,15 +257,27 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
234
  if str(raw.get("type", "")).lower() == "meta":
235
  return {}
236
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
238
  html_content = raw.get("body_html") or raw.get("html") or ""
239
  if html_content and not body_text_raw:
240
  body_text_raw = html_to_text(html_content)
241
-
242
  body_text_raw = ftfy.fix_text(body_text_raw or "")
243
 
244
  subject_text = ""
245
  from_name = from_email = from_domain = ""
 
246
  date_val = raw.get("date") or raw.get("Date") or ""
247
 
248
  if body_text_raw:
@@ -250,6 +285,8 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
250
  subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
251
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
252
  date_val = headers.get("Date", "") or date_val
 
 
253
 
254
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
255
  body_clean = URL_RE.sub(" URL ", body_clean)
@@ -267,6 +304,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
267
  sender = raw.get("from") or raw.get("From") or ""
268
  from_name, from_email = parse_name_email(sender)
269
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
 
270
 
271
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
272
 
@@ -302,9 +340,11 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
302
  "from_name": from_name,
303
  "from_email": from_email,
304
  "from_domain": from_domain,
 
305
  "subject": subject_norm,
306
  "body_text": body_text,
307
  "lang": lang,
 
308
  "text_hash": text_hash,
309
  }
310
 
@@ -322,6 +362,8 @@ def has_suspect_tag(text: str) -> List[str]:
322
  if "wire" in low or "transfer" in low or "cash" in low:
323
  if "finance" not in tags:
324
  tags.append("finance")
 
 
325
  return tags
326
 
327
  def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
@@ -337,19 +379,60 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
337
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
338
  return df
339
 
340
- def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  subject = (row.get("subject") or "").strip()
342
  body = (row.get("body_text") or "").strip()
343
  from_email = row.get("from_email") or ""
344
  date = row.get("date") or ""
345
  tags = row.get("tags") or []
 
346
  sentiment = row.get("sentiment") or "(unknown)"
347
 
 
 
 
 
 
 
 
 
 
 
 
348
  def hi(text: str) -> str:
349
- if not text or not query_terms:
350
  return text
351
  out = text
352
- for qt in query_terms:
353
  if not qt:
354
  continue
355
  try:
@@ -366,9 +449,17 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
366
  dir_attr = ' dir="rtl"' if rtl else ""
367
  body_html = body_h.replace("\n", "<br/>")
368
 
 
 
 
369
  tag_html = ""
 
370
  if isinstance(tags, list) and tags:
371
- tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
 
 
 
 
372
 
373
  cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
374
 
@@ -401,9 +492,7 @@ class BM25Transformer:
401
  self.avgdl_ = None
402
 
403
  def fit(self, X):
404
- # X is term-frequency (CountVectorizer)
405
  N = X.shape[0]
406
- # document frequency per term
407
  df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
408
  self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
409
  dl = np.asarray(X.sum(axis=1)).ravel()
@@ -423,7 +512,6 @@ class BM25Transformer:
423
  data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
424
  return X
425
 
426
- # Add enrichment tokens to help the model lock onto key signals
427
  def enrich_text(row: pd.Series) -> str:
428
  subj = row.get("subject","") or ""
429
  body = row.get("body_text","") or ""
@@ -433,6 +521,7 @@ def enrich_text(row: pd.Series) -> str:
433
  if PHONE_RE.search(t): tokens.append("__HAS_PHONE__")
434
  if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
435
  if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
 
436
  return (t + " " + " ".join(tokens)).strip()
437
 
438
  # =================== Cluster labeling: PMI bigrams ===================
@@ -466,7 +555,7 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
466
  labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
467
  return labels_out
468
 
469
- # =================== Auto-k (Kneedle on inertia) ===================
470
  def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
471
  n = X.shape[0]
472
  if n > 40000:
@@ -490,10 +579,8 @@ def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
490
  return k_best, dict(zip(ks, inertias))
491
 
492
  def auto_k_rule(n_docs: int) -> int:
493
- # Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
494
  return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
495
 
496
- # =================== Merge close clusters (LSA space only to save RAM) ===================
497
  def merge_close_clusters(labels, centers, thresh=0.92):
498
  centers = sk_normalize(centers)
499
  sim = cosine_similarity(centers, centers)
@@ -517,11 +604,9 @@ def merge_close_clusters(labels, centers, thresh=0.92):
517
  labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
518
  return labels2
519
 
520
- # =================== Seeded centroids (only if LSA enabled) ===================
521
  def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
522
  lsa_components: np.ndarray, norm_obj: Normalizer,
523
  d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
524
- # Build a few unit vectors in word-term space based on lexicons
525
  seeds_word = []
526
  vocab = count_vec.vocabulary_
527
  for _, words in lexicons.items():
@@ -536,30 +621,45 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
536
  seeds_word.append(v)
537
  if not seeds_word:
538
  return None
539
- # Lift to full feature space (word + char) by padding zeros for char dims
540
  seeds_full = []
541
  for v in seeds_word:
542
  vf = np.zeros((d_full,), dtype=np.float32)
543
  vf[:d_word] = v
544
  seeds_full.append(vf)
545
- seeds_full = np.stack(seeds_full, axis=0) # (s, n_features)
546
- # Project to LSA space: x @ components_.T then normalize
547
- seeds_red = seeds_full @ lsa_components.T # (s, lsa_dim)
548
  seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
549
- # If fewer than k seeds, KMeans will accept; scikit-learn requires init shape == (k, d)
550
- # We’ll return only if seeds count >= 2 to be meaningful; otherwise None
551
  if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
552
  return seeds_red
553
  return None
554
 
555
- # =================== Corruption scoring ===================
556
- def corruption_score(row):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  score = 0.0
558
  txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
559
  for ph in SUSPECT_PHRASES:
560
  if ph in txt:
561
  score += 2.0
562
  break
 
 
563
  if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
564
  score += 1.5
565
  if MONEY_RE.search(txt): score += 0.7
@@ -569,6 +669,14 @@ def corruption_score(row):
569
  body_len = len(row.get("body_text",""))
570
  if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
571
  score += 0.5
 
 
 
 
 
 
 
 
572
  return score
573
 
574
  # =================== Gradio UI ===================
@@ -587,13 +695,11 @@ CSS = """
587
  mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
588
  hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
589
  .small { color:#475569; font-size:12px; }
 
590
  """
591
 
592
  with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
593
- gr.Markdown("""
594
- # Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans
595
- **Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, corruption score, and sentiment.
596
- """)
597
 
598
  with gr.Row():
599
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
@@ -613,21 +719,25 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
613
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
614
  with gr.Row():
615
  use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
 
616
 
617
- with gr.Accordion("Filters", open=True):
618
  with gr.Row():
619
- cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
620
- domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
621
- sentiment_drop = gr.Dropdown(
622
- label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)"
623
- )
624
  with gr.Row():
625
- tag_drop = gr.Dropdown(
626
- label="Tag", choices=["(any)", "🚩suspect", "finance"], value="(any)"
627
- )
 
 
 
628
  with gr.Row():
629
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
630
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
 
 
631
 
632
  with gr.Row():
633
  run_btn = gr.Button("Process", variant="primary")
@@ -635,9 +745,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
635
  status = gr.Markdown("")
636
 
637
  with gr.Row():
638
- cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
639
  domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
640
 
 
 
 
 
641
  gr.Markdown("### Search")
642
  with gr.Row():
643
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
@@ -646,17 +760,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
646
  email_view = gr.HTML(label="Reader")
647
 
648
  # State
649
- state_df = gr.State() # full dataframe
650
- state_vec = gr.State() # {"count_vec":..., "char_vec":..., "bm25":...}
651
- state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
652
- state_index = gr.State() # Faiss index or sklearn NN
653
- state_term_names = gr.State() # dict cluster_id -> label
654
- state_query_terms = gr.State() # last search terms list
655
  state_use_lsa = gr.State()
656
  state_use_faiss = gr.State()
657
  state_svd = gr.State()
658
  state_norm = gr.State()
659
- state_dims = gr.State() # (d_word, d_char)
 
 
660
 
661
  # -------- IO helpers --------
662
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
@@ -691,6 +807,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
691
  df: pd.DataFrame,
692
  cluster: Optional[str],
693
  domain: Optional[str],
 
 
694
  sentiment: str,
695
  tag_value: str,
696
  start: str,
@@ -704,10 +822,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
704
  out = out[out["cluster_id"] == cid]
705
  if domain and domain != "(any)":
706
  out = out[out["from_domain"] == domain]
 
 
 
 
707
  if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
708
  out = out[out["sentiment"].astype(str) == sentiment]
709
  if tag_value and tag_value != "(any)":
710
- out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
 
711
  if start:
712
  try:
713
  dt = pd.to_datetime(start, utc=True, errors="coerce")
@@ -722,19 +845,47 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
722
  pass
723
  return out
724
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
725
  # -------- Main pipeline --------
726
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
727
- use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
 
728
  if inbox_file is None:
729
  return ("**Please upload a file.**",
730
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
 
731
 
732
  use_lang = not bool(skip_lang)
733
 
 
 
 
 
 
 
734
  recs = _load_json_records(inbox_file.name)
735
  if not recs:
736
  return ("**No valid records found.**",
737
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
 
738
 
739
  # Normalize
740
  normd = []
@@ -745,19 +896,35 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
745
  df = pd.DataFrame(normd)
746
  if df.empty:
747
  return ("**No usable email records after normalization.**",
748
- None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
 
749
 
750
  # Deduplicate conservatively
751
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
752
 
753
- # Tags (suspect/finance) + Sentiment
754
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
755
  df = compute_sentiment_column(df)
756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
  # Enriched texts (adds __HAS_*__ flags)
758
  texts = list(df.apply(enrich_text, axis=1))
759
 
760
- # === Vectorization: BM25 word + char tf-idf, then optional LSA ===
761
  ngram_range = (1, 2) if use_bigrams else (1, 1)
762
  count_vec = CountVectorizer(
763
  analyzer="word",
@@ -784,42 +951,49 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
784
  d_char = X_char.shape[1]
785
  d_full = X_full.shape[1]
786
 
787
- # LSA (TruncatedSVD + Normalizer) for stability/quality
788
  use_lsa = bool(use_lsa)
789
  X_reduced = None
790
  svd_obj = None
791
  norm_obj = None
792
  if use_lsa:
793
  svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
794
- X_reduced_tmp = svd_obj.fit_transform(X_full) # dense (n_docs x lsa_dim)
795
  norm_obj = Normalizer(copy=False)
796
  X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
797
  del X_reduced_tmp
798
  gc.collect()
799
 
 
 
 
 
 
 
 
 
 
 
 
 
800
  # K selection
801
  if bool(auto_k):
802
  if use_lsa:
803
  k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
804
  else:
805
- # fallback: heuristic rule on doc count
806
  k = auto_k_rule(X_full.shape[0])
807
  else:
808
  k = max(10, int(k_clusters or 350))
809
 
810
- # Optional seeded init (only in LSA space to keep memory sane)
811
  init = None
812
  if use_lsa:
813
  seeds = seeded_centroids_in_lsa(
814
  CORR_LEX, count_vec, svd_obj.components_, norm_obj,
815
  d_word=d_word, d_full=d_full, k=k
816
  )
817
- if seeds is not None and seeds.shape[0] <= k:
818
- # If fewer seeds than k, KMeans will handle by k-means++ for remaining centers internally only for KMeans.
819
- # For MiniBatchKMeans, we must provide exactly k centers or fall back to k-means++.
820
- # So use seeds only if seeds.shape[0] == k; otherwise None.
821
- if seeds.shape[0] == k:
822
- init = seeds
823
 
824
  # KMeans clustering (use LSA space if enabled)
825
  X_space = (X_reduced if use_lsa else X_full)
@@ -832,18 +1006,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
832
  )
833
  labels = kmeans.fit_predict(X_space)
834
 
835
- # Optional: merge very-similar clusters (only when LSA enabled)
836
  if use_lsa:
837
  labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
838
 
839
  df["cluster_id"] = labels
840
 
841
- # Name clusters by PMI bigrams on raw enriched texts
842
  term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
843
  df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
844
 
845
- # CorruptionScore
846
- df["corruption_score"] = df.apply(corruption_score, axis=1)
847
 
848
  # Build search index
849
  use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
@@ -877,15 +1051,33 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
877
  )
878
  domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
879
 
880
- # Results preview default: rank by corruption_score then date desc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
881
  show_df = df.copy()
882
  if "date" in show_df.columns and show_df["date"].notna().any():
883
  show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
884
  else:
885
  show_df["_dt"] = pd.NaT
886
  show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
887
-
888
- cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score"]
889
  out_table = show_df[cols_out].head(500)
890
 
891
  vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
@@ -894,66 +1086,91 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
894
  f"**Processed {len(df):,} emails** \n"
895
  f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
896
  f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
897
- f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}"
 
898
  )
899
 
900
  gc.collect()
901
 
902
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
903
  domain_update = gr.update(choices=domain_choices, value="(any)")
 
 
904
 
905
  return (
906
  status_md,
907
  cluster_counts, domain_counts,
 
908
  out_table,
909
  df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
910
  use_lsa, bool(use_faiss),
911
- cluster_update, domain_update,
912
  svd_obj, norm_obj,
913
- (d_word, d_char)
 
914
  )
915
 
916
  (run_btn.click)(
917
  process_file,
918
  inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
919
- use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss],
 
920
  outputs=[status,
921
  cluster_counts_df, domain_counts_df,
 
922
  results_df,
923
  state_df, state_vec, state_X_reduced, state_index, state_term_names,
924
  state_use_lsa, state_use_faiss,
925
- cluster_drop, domain_drop,
926
  state_svd, state_norm,
927
- state_dims]
 
928
  )
929
 
930
  # -------- Filtering & Search --------
931
- def refresh_results(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end):
932
- if df is None or len(df) == 0:
933
  return pd.DataFrame()
934
- filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
935
- cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score"]
936
- if "date" in filt.columns and filt["date"].notna().any():
937
- tmp = filt.copy()
938
  tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
939
- tmp = tmp.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
940
- return tmp[cols_out].head(500)
941
- return filt.sort_values(["corruption_score"], ascending=False)[cols_out].head(500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942
 
943
- for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
944
  ctrl.change(
945
  refresh_results,
946
- inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
947
  outputs=[results_df]
948
  )
949
 
950
  reset_btn.click(
951
- lambda: [None, None, "(any)", "(any)", "", ""],
952
  inputs=[],
953
- outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
954
  ).then(
955
  refresh_results,
956
- inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
957
  outputs=[results_df]
958
  )
959
 
@@ -986,7 +1203,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
986
  q_full = hstack([q_word, q_char], format="csr")
987
  return q_full
988
 
989
- def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj):
990
  if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
991
  return pd.DataFrame(), []
992
  q_terms = _tokenize_query(q)
@@ -1003,31 +1220,41 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1003
  inds = indices[0]
1004
  sims = 1.0 - distances[0]
1005
  results = df.iloc[inds].copy()
1006
- results["score"] = sims
1007
  elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
1008
  D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
1009
  inds = I[0]
1010
  sims = D[0]
1011
  results = df.iloc[inds].copy()
1012
- results["score"] = sims
1013
  else:
1014
  return pd.DataFrame(), q_terms
1015
 
1016
- cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score", "score"]
1017
- # Rerank by a blend: 0.7 * ANN score + 0.3 * corruption_score (scaled)
1018
  cs = results["corruption_score"].fillna(0.0)
1019
  cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
1020
- results["_blend"] = 0.7*results["score"].values + 0.3*cs.values
1021
- results = results.sort_values("_blend", ascending=False).drop(columns=["_blend"])
 
 
 
 
 
 
 
 
 
 
1022
  return results[cols].head(50), q_terms
1023
 
1024
  search_btn.click(
1025
  search_fn,
1026
- inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss, state_svd, state_norm],
1027
  outputs=[results_df, state_query_terms]
1028
  )
1029
 
1030
- def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str], query_terms: Optional[List[str]]):
 
1031
  try:
1032
  row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
1033
  except Exception:
@@ -1052,13 +1279,34 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1052
  row = cand.iloc[0]
1053
  cid = int(row.get("cluster_id", -1))
1054
  clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
1055
- return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel)
1056
 
1057
  results_df.select(
1058
  on_row_select,
1059
- inputs=[results_df, state_df, state_term_names, state_query_terms],
1060
  outputs=[email_view]
1061
  )
1062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1063
  if __name__ == "__main__":
1064
  demo.launch()
 
25
  from sklearn.preprocessing import normalize as sk_normalize
26
  from sklearn.metrics.pairwise import cosine_similarity
27
 
28
+ # Optional light anomaly detection
29
+ try:
30
+ from sklearn.ensemble import IsolationForest
31
+ ISO_OK = True
32
+ except Exception:
33
+ ISO_OK = False
34
+
35
  from scipy.sparse import hstack
36
 
37
  # Optional fast ANN (CPU)
 
41
  except Exception:
42
  FAISS_OK = False
43
 
44
+ # Optional tiny sentiment
45
  try:
46
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
47
  VADER_OK = True
 
49
  VADER_OK = False
50
 
51
  # =================== Regex & Flags ===================
 
52
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 
 
53
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
 
 
54
  QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
 
 
55
  SIG_RE = re.compile(r"\n-- ?\n", re.M)
 
 
56
  SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
57
  HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
 
 
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
  FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
  ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
 
 
62
  SKIP_LANGDETECT = True
63
 
64
+ # ==== Expanded corruption lexicon ====
65
  SUSPECT_PHRASES = [
66
+ # core corruption/finance
67
+ "off the books","cover up","kickback","bribe","under the table",
68
+ "no inspection","special fee","friendly payment","confidential deal",
69
+ "nobody will find out","pay to play","cash only","shell company",
70
+ "bid rigging","embezzle","slush fund","false invoice","ghost employee",
71
+ "contract splitting","grease payment","unreported","unrecorded",
72
+ # secrecy/evasion
73
+ "off the record","just between us","don’t quote me on this","dont quote me on this",
74
+ "we never had this conversation","keep this between us","not ethical","illegal",
75
+ "grey area","gray area","write off","failed investment","they owe it to me",
76
+ # off-channel comms
77
+ "let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
78
+ "don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
79
+ "tell you on the phone","talk in person","come by my office","vpn",
80
+ # financial secrecy & accounting games
81
+ "tax haven","off-shore account","offshore account","backdate","pull earnings forward",
82
+ "delete this email","no inspection","special fees","wire instructions",
83
  ]
84
+ # Evasive acronyms / slang (case-insensitive)
85
+ EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
86
 
87
+ # Entity regexes
88
  MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
89
  PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
90
+ INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
91
  COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
92
+ ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
93
+
94
+ # Off-channel patterns (apps / phrases)
95
+ OFFCHANNEL_PATTERNS = [
96
+ r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
97
+ r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
98
+ r"take this offline", r"don.?t (text|email) (me|this)",
99
+ r"\bOTR\b", r"\bTOL\b", r"\bTYOP\b", r"\bLDL\b"
100
+ ]
101
+ OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
102
+
103
+ # Common personal mail domains (used with user-specified trusted org domains)
104
+ PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
105
 
106
  # Optional seeded themes for semi-supervised init (used only when LSA is ON)
107
  CORR_LEX = {
 
111
  "money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
112
  }
113
 
114
+ # =================== Label cleanup helpers (unchanged core) ===================
115
  EN_STOP = {
116
  "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
117
  "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
118
  "re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
119
  "message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
120
+ "herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard","ny"
 
 
 
 
 
121
  }
122
+ HE_STOP = {"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה","שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"}
123
  MONTHS = {
124
  "jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
125
  "january","february","march","april","june","july","august","september",
 
134
  tl = t.lower()
135
  if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
136
  return True
137
+ if EMAIL_LIKE_RE.search(tl): return True
138
+ if YEAR_RE.match(tl): return True
139
+ if NUMERIC_RE.match(tl): return True
140
+ if ONE_CHAR_RE.match(tl): return True
 
 
 
 
141
  return False
142
 
143
  def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
 
161
  break
162
  return cleaned
163
 
164
+ # =================== HTML/Text & Header Parsing ===================
165
  def html_to_text(html: str) -> str:
166
  if not html:
167
  return ""
 
197
  return (m.group(1) or "").strip(), (m.group(2) or "").strip()
198
  return "", s.strip()
199
 
200
+ def parse_multi_emails(s: str) -> List[str]:
201
+ if not s: return []
202
+ parts = re.split(r",\s*(?=[^,]*@)", s)
203
+ emails = []
204
+ for p in parts:
205
+ _, e = parse_name_email(p.strip())
206
+ if e: emails.append(e)
207
+ return emails
208
+
209
  def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
210
  headers: Dict[str, str] = {}
211
  lines = (text or "").splitlines()
212
+ header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject|Subject:|To:|Cc:|Bcc:|From:):')
213
  i = 0
214
  saw_header = False
215
  while i < len(lines):
 
257
  if str(raw.get("type", "")).lower() == "meta":
258
  return {}
259
 
260
+ # attachments (names); accept common schemas
261
+ attach_names = []
262
+ atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
263
+ if isinstance(atts, list):
264
+ for a in atts:
265
+ if isinstance(a, dict):
266
+ name = a.get("filename") or a.get("name") or ""
267
+ else:
268
+ name = str(a)
269
+ if name:
270
+ attach_names.append(str(name))
271
+
272
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
273
  html_content = raw.get("body_html") or raw.get("html") or ""
274
  if html_content and not body_text_raw:
275
  body_text_raw = html_to_text(html_content)
 
276
  body_text_raw = ftfy.fix_text(body_text_raw or "")
277
 
278
  subject_text = ""
279
  from_name = from_email = from_domain = ""
280
+ to_emails: List[str] = []
281
  date_val = raw.get("date") or raw.get("Date") or ""
282
 
283
  if body_text_raw:
 
285
  subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
286
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
287
  date_val = headers.get("Date", "") or date_val
288
+ to_emails = parse_multi_emails(headers.get("To","") or (raw.get("to") or "")) + \
289
+ parse_multi_emails(headers.get("Cc","") or (raw.get("cc") or ""))
290
 
291
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
292
  body_clean = URL_RE.sub(" URL ", body_clean)
 
304
  sender = raw.get("from") or raw.get("From") or ""
305
  from_name, from_email = parse_name_email(sender)
306
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
307
+ to_emails = parse_multi_emails(raw.get("to") or "") + parse_multi_emails(raw.get("cc") or "")
308
 
309
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
310
 
 
340
  "from_name": from_name,
341
  "from_email": from_email,
342
  "from_domain": from_domain,
343
+ "to_emails": to_emails,
344
  "subject": subject_norm,
345
  "body_text": body_text,
346
  "lang": lang,
347
+ "attachments": attach_names,
348
  "text_hash": text_hash,
349
  }
350
 
 
362
  if "wire" in low or "transfer" in low or "cash" in low:
363
  if "finance" not in tags:
364
  tags.append("finance")
365
+ if OFFCHANNEL_RE.search(low) or EVASIVE_ACRO_RE.search(low):
366
+ tags.append("off-channel")
367
  return tags
368
 
369
  def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
 
379
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
380
  return df
381
 
382
+ # Visual highlight helpers
383
+ def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
384
+ terms = []
385
+ txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
386
+ # suspect phrases found in this row
387
+ for p in SUSPECT_PHRASES:
388
+ if p in txt:
389
+ terms.append(p)
390
+ # entity markers
391
+ if MONEY_RE.search(txt): terms.append("$")
392
+ if INVOICE_RE.search(txt): terms.append("invoice")
393
+ # regex-based (keep as literal samples)
394
+ if PHONE_RE.search(row.get("body_text","") or ""): terms.append("phone")
395
+ # extras from user input
396
+ for t in extra_terms or []:
397
+ t=t.strip()
398
+ if t and t.lower() in txt:
399
+ terms.append(t)
400
+ # dedupe
401
+ out, seen = [], set()
402
+ for t in terms:
403
+ if t.lower() not in seen:
404
+ out.append(t)
405
+ seen.add(t.lower())
406
+ return out[:24]
407
+
408
+ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
409
+ cluster_label: Optional[str] = None,
410
+ do_highlight: bool = True,
411
+ extra_terms: Optional[List[str]] = None) -> str:
412
  subject = (row.get("subject") or "").strip()
413
  body = (row.get("body_text") or "").strip()
414
  from_email = row.get("from_email") or ""
415
  date = row.get("date") or ""
416
  tags = row.get("tags") or []
417
+ flags = row.get("flags") or []
418
  sentiment = row.get("sentiment") or "(unknown)"
419
 
420
+ hl_terms = []
421
+ if do_highlight:
422
+ hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
423
+ # make unique, case-insensitive
424
+ seen=set(); uniq=[]
425
+ for t in hl_terms:
426
+ tl=t.lower()
427
+ if tl and tl not in seen:
428
+ uniq.append(t); seen.add(tl)
429
+ hl_terms = uniq[:24]
430
+
431
  def hi(text: str) -> str:
432
+ if not text or not do_highlight or not hl_terms:
433
  return text
434
  out = text
435
+ for qt in hl_terms:
436
  if not qt:
437
  continue
438
  try:
 
449
  dir_attr = ' dir="rtl"' if rtl else ""
450
  body_html = body_h.replace("\n", "<br/>")
451
 
452
+ def pill(s, cls="tag"):
453
+ return f'<span class="{cls}">{s}</span>'
454
+
455
  tag_html = ""
456
+ pills = []
457
  if isinstance(tags, list) and tags:
458
+ pills += [pill(t, "tag") for t in tags]
459
+ if isinstance(flags, list) and flags:
460
+ pills += [pill(f, "tag") for f in flags]
461
+ if pills:
462
+ tag_html = " ".join(pills)
463
 
464
  cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
465
 
 
492
  self.avgdl_ = None
493
 
494
  def fit(self, X):
 
495
  N = X.shape[0]
 
496
  df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
497
  self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
498
  dl = np.asarray(X.sum(axis=1)).ravel()
 
512
  data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
513
  return X
514
 
 
515
  def enrich_text(row: pd.Series) -> str:
516
  subj = row.get("subject","") or ""
517
  body = row.get("body_text","") or ""
 
521
  if PHONE_RE.search(t): tokens.append("__HAS_PHONE__")
522
  if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
523
  if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
524
+ if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
525
  return (t + " " + " ".join(tokens)).strip()
526
 
527
  # =================== Cluster labeling: PMI bigrams ===================
 
555
  labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
556
  return labels_out
557
 
558
+ # =================== Auto-k & merge ===================
559
  def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
560
  n = X.shape[0]
561
  if n > 40000:
 
579
  return k_best, dict(zip(ks, inertias))
580
 
581
  def auto_k_rule(n_docs: int) -> int:
 
582
  return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
583
 
 
584
  def merge_close_clusters(labels, centers, thresh=0.92):
585
  centers = sk_normalize(centers)
586
  sim = cosine_similarity(centers, centers)
 
604
  labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
605
  return labels2
606
 
 
607
  def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
608
  lsa_components: np.ndarray, norm_obj: Normalizer,
609
  d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
 
610
  seeds_word = []
611
  vocab = count_vec.vocabulary_
612
  for _, words in lexicons.items():
 
621
  seeds_word.append(v)
622
  if not seeds_word:
623
  return None
 
624
  seeds_full = []
625
  for v in seeds_word:
626
  vf = np.zeros((d_full,), dtype=np.float32)
627
  vf[:d_word] = v
628
  seeds_full.append(vf)
629
+ seeds_full = np.stack(seeds_full, axis=0)
630
+ seeds_red = seeds_full @ lsa_components.T
 
631
  seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
 
 
632
  if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
633
  return seeds_red
634
  return None
635
 
636
+ # =================== Scoring & Flags ===================
637
+ def _hour_of(dt_iso: str) -> Optional[int]:
638
+ try:
639
+ if not dt_iso: return None
640
+ dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
641
+ if pd.isna(dt): return None
642
+ # treat UTC for lack of per-user tz; still useful as "odd hour"
643
+ return int(dt.hour)
644
+ except Exception:
645
+ return None
646
+
647
+ def _attachment_flags(names: List[str]) -> List[str]:
648
+ flags=[]
649
+ for n in names or []:
650
+ if ATTACH_NAME_RE.search(n):
651
+ flags.append("📎 " + n[:40])
652
+ return flags[:5]
653
+
654
+ def corruption_score(row, trusted_domains: set):
655
  score = 0.0
656
  txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
657
  for ph in SUSPECT_PHRASES:
658
  if ph in txt:
659
  score += 2.0
660
  break
661
+ if EVASIVE_ACRO_RE.search(txt) or OFFCHANNEL_RE.search(txt):
662
+ score += 1.0
663
  if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
664
  score += 1.5
665
  if MONEY_RE.search(txt): score += 0.7
 
669
  body_len = len(row.get("body_text",""))
670
  if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
671
  score += 0.5
672
+ # personal/off-channel via headers
673
+ fd = (row.get("from_domain") or "").lower()
674
+ if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
675
+ score += 0.5
676
+ # odd hours
677
+ h = _hour_of(row.get("date") or "")
678
+ if h is not None and (h < 6 or h > 22):
679
+ score += 0.3
680
  return score
681
 
682
  # =================== Gradio UI ===================
 
695
  mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
696
  hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
697
  .small { color:#475569; font-size:12px; }
698
+ .cursor { cursor:pointer; }
699
  """
700
 
701
  with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
702
+ gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
 
 
 
703
 
704
  with gr.Row():
705
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
 
719
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
720
  with gr.Row():
721
  use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
722
+ use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
723
 
724
+ with gr.Accordion("Investigation Controls", open=True):
725
  with gr.Row():
726
+ trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
727
+ extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
728
+ highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
 
 
729
  with gr.Row():
730
+ cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
731
+ domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
732
+ sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
733
+ lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
734
+ sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
735
+ tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "odd-hours", "personal-mail"], value="(any)")
736
  with gr.Row():
737
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
738
  date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
739
+ sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
740
+ sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
741
 
742
  with gr.Row():
743
  run_btn = gr.Button("Process", variant="primary")
 
745
  status = gr.Markdown("")
746
 
747
  with gr.Row():
748
+ cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
749
  domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
750
 
751
+ with gr.Row():
752
+ actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
753
+ offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
754
+
755
  gr.Markdown("### Search")
756
  with gr.Row():
757
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
 
760
  email_view = gr.HTML(label="Reader")
761
 
762
  # State
763
+ state_df = gr.State()
764
+ state_vec = gr.State()
765
+ state_X_reduced = gr.State()
766
+ state_index = gr.State()
767
+ state_term_names = gr.State()
768
+ state_query_terms = gr.State()
769
  state_use_lsa = gr.State()
770
  state_use_faiss = gr.State()
771
  state_svd = gr.State()
772
  state_norm = gr.State()
773
+ state_dims = gr.State()
774
+ state_extra_terms = gr.State()
775
+ state_highlight = gr.State()
776
 
777
  # -------- IO helpers --------
778
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
 
807
  df: pd.DataFrame,
808
  cluster: Optional[str],
809
  domain: Optional[str],
810
+ sender: Optional[str],
811
+ lang_value: str,
812
  sentiment: str,
813
  tag_value: str,
814
  start: str,
 
822
  out = out[out["cluster_id"] == cid]
823
  if domain and domain != "(any)":
824
  out = out[out["from_domain"] == domain]
825
+ if sender and sender != "(any)":
826
+ out = out[out["from_email"] == sender]
827
+ if lang_value and lang_value != "(any)":
828
+ out = out[out["lang"] == lang_value]
829
  if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
830
  out = out[out["sentiment"].astype(str) == sentiment]
831
  if tag_value and tag_value != "(any)":
832
+ out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))
833
+ | out["flags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
834
  if start:
835
  try:
836
  dt = pd.to_datetime(start, utc=True, errors="coerce")
 
845
  pass
846
  return out
847
 
848
+ # -------- Simple social network stats --------
849
+ def social_stats(df: pd.DataFrame) -> pd.DataFrame:
850
+ # degree = unique counterparts per address (from <-> each to/cc)
851
+ deg = {}
852
+ def add_edge(a,b):
853
+ if not a or not b or a==b: return
854
+ deg.setdefault(a,set()).add(b)
855
+ deg.setdefault(b,set()).add(a)
856
+ for _, r in df.iterrows():
857
+ f = r.get("from_email") or ""
858
+ tos = r.get("to_emails") or []
859
+ for t in tos:
860
+ add_edge(f, t)
861
+ rows=[]
862
+ for addr, nbrs in deg.items():
863
+ rows.append({"address": addr, "degree": len(nbrs)})
864
+ out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
865
+ return out
866
+
867
  # -------- Main pipeline --------
868
  def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
869
+ use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
870
+ trusted_domains_in, extra_keywords_in, highlight_toggle):
871
  if inbox_file is None:
872
  return ("**Please upload a file.**",
873
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
874
+ None, None, None, None)
875
 
876
  use_lang = not bool(skip_lang)
877
 
878
+ # trusted org domains
879
+ trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
880
+ extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
881
+ # extend SUSPECT_PHRASES runtime (no mutation of constant list)
882
+ extra_terms_lower = [t.lower() for t in extra_terms]
883
+
884
  recs = _load_json_records(inbox_file.name)
885
  if not recs:
886
  return ("**No valid records found.**",
887
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
888
+ None, None, None, None)
889
 
890
  # Normalize
891
  normd = []
 
896
  df = pd.DataFrame(normd)
897
  if df.empty:
898
  return ("**No usable email records after normalization.**",
899
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
900
+ None, None, None, None)
901
 
902
  # Deduplicate conservatively
903
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
904
 
905
+ # Tags + Sentiment
906
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
907
  df = compute_sentiment_column(df)
908
 
909
+ # Row-level flags (off-hours, personal-mail, attachments)
910
+ flags = []
911
+ for _, row in df.iterrows():
912
+ f = []
913
+ h = _hour_of(row.get("date") or "")
914
+ if h is not None and (h < 6 or h > 22):
915
+ f.append("odd-hours")
916
+ fd = (row.get("from_domain") or "").lower()
917
+ if (fd in PERSONAL_DOMAINS) and (fd not in trusted):
918
+ f.append("personal-mail")
919
+ # attachment names of interest
920
+ f += _attachment_flags(row.get("attachments") or [])
921
+ flags.append(f)
922
+ df["flags"] = flags
923
+
924
  # Enriched texts (adds __HAS_*__ flags)
925
  texts = list(df.apply(enrich_text, axis=1))
926
 
927
+ # === Vectorization ===
928
  ngram_range = (1, 2) if use_bigrams else (1, 1)
929
  count_vec = CountVectorizer(
930
  analyzer="word",
 
951
  d_char = X_char.shape[1]
952
  d_full = X_full.shape[1]
953
 
954
+ # LSA
955
  use_lsa = bool(use_lsa)
956
  X_reduced = None
957
  svd_obj = None
958
  norm_obj = None
959
  if use_lsa:
960
  svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
961
+ X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
962
  norm_obj = Normalizer(copy=False)
963
  X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
964
  del X_reduced_tmp
965
  gc.collect()
966
 
967
+ # Optional anomaly detection (on LSA space)
968
+ anomaly_scores = np.full((len(df),), np.nan, dtype=np.float32)
969
+ if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
970
+ try:
971
+ iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
972
+ iso.fit(X_reduced)
973
+ # higher is less anomalous; convert to anomaly score = -score
974
+ anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
975
+ except Exception:
976
+ pass
977
+ df["anomaly_score"] = anomaly_scores
978
+
979
  # K selection
980
  if bool(auto_k):
981
  if use_lsa:
982
  k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
983
  else:
 
984
  k = auto_k_rule(X_full.shape[0])
985
  else:
986
  k = max(10, int(k_clusters or 350))
987
 
988
+ # Optional seeded init (only in LSA space)
989
  init = None
990
  if use_lsa:
991
  seeds = seeded_centroids_in_lsa(
992
  CORR_LEX, count_vec, svd_obj.components_, norm_obj,
993
  d_word=d_word, d_full=d_full, k=k
994
  )
995
+ if seeds is not None and seeds.shape[0] == k:
996
+ init = seeds
 
 
 
 
997
 
998
  # KMeans clustering (use LSA space if enabled)
999
  X_space = (X_reduced if use_lsa else X_full)
 
1006
  )
1007
  labels = kmeans.fit_predict(X_space)
1008
 
1009
+ # Merge very-similar clusters (LSA only)
1010
  if use_lsa:
1011
  labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
1012
 
1013
  df["cluster_id"] = labels
1014
 
1015
+ # Cluster names
1016
  term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
1017
  df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
1018
 
1019
+ # CorruptionScore (now uses trusted domains)
1020
+ df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1021
 
1022
  # Build search index
1023
  use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
 
1051
  )
1052
  domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
1053
 
1054
+ sender_counts = (
1055
+ df.groupby("from_email").size()
1056
+ .reset_index(name="count")
1057
+ .sort_values("count", ascending=False)
1058
+ .head(200)
1059
+ )
1060
+ sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
1061
+
1062
+ # Languages present
1063
+ langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
1064
+ lang_choices = ["(any)"] + langs
1065
+
1066
+ # Social stats
1067
+ actors = social_stats(df)
1068
+
1069
+ # Off-hours & personal mail table
1070
+ offp = df[(df["flags"].apply(lambda xs: "odd-hours" in (xs or []))) | (df["flags"].apply(lambda xs: "personal-mail" in (xs or [])))]
1071
+ offhours_table = offp[["date","from_email","from_domain","subject","flags","corruption_score"]].sort_values("corruption_score", ascending=False).head(200)
1072
+
1073
+ # Default results (sorted)
1074
  show_df = df.copy()
1075
  if "date" in show_df.columns and show_df["date"].notna().any():
1076
  show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
1077
  else:
1078
  show_df["_dt"] = pd.NaT
1079
  show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
1080
+ cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
 
1081
  out_table = show_df[cols_out].head(500)
1082
 
1083
  vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
 
1086
  f"**Processed {len(df):,} emails** \n"
1087
  f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
1088
  f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
1089
+ f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1090
+ f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
1091
  )
1092
 
1093
  gc.collect()
1094
 
1095
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
1096
  domain_update = gr.update(choices=domain_choices, value="(any)")
1097
+ sender_update = gr.update(choices=sender_choices, value="(any)")
1098
+ lang_update = gr.update(choices=lang_choices, value="(any)")
1099
 
1100
  return (
1101
  status_md,
1102
  cluster_counts, domain_counts,
1103
+ actors, offhours_table,
1104
  out_table,
1105
  df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
1106
  use_lsa, bool(use_faiss),
1107
+ cluster_update, domain_update, sender_update, lang_update,
1108
  svd_obj, norm_obj,
1109
+ (d_word, d_char),
1110
+ extra_terms_lower, bool(highlight_toggle)
1111
  )
1112
 
1113
  (run_btn.click)(
1114
  process_file,
1115
  inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
1116
+ use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
1117
+ trusted_domains_in, extra_keywords_in, highlight_toggle],
1118
  outputs=[status,
1119
  cluster_counts_df, domain_counts_df,
1120
+ actors_df, offhours_df,
1121
  results_df,
1122
  state_df, state_vec, state_X_reduced, state_index, state_term_names,
1123
  state_use_lsa, state_use_faiss,
1124
+ cluster_drop, domain_drop, sender_drop, lang_drop,
1125
  state_svd, state_norm,
1126
+ state_dims,
1127
+ state_extra_terms, state_highlight]
1128
  )
1129
 
1130
  # -------- Filtering & Search --------
1131
+ def _sort_results(df, by, direction):
1132
+ if df is None or len(df)==0:
1133
  return pd.DataFrame()
1134
+ tmp = df.copy()
1135
+ if "date" in tmp.columns:
 
 
1136
  tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
1137
+ else:
1138
+ tmp["_dt"] = pd.NaT
1139
+ by = by or "corruption_score"
1140
+ asc = (direction == "asc")
1141
+ if by == "date":
1142
+ tmp = tmp.sort_values(["_dt"], ascending=asc)
1143
+ elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
1144
+ tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
1145
+ else:
1146
+ # corruption_score or search_score (if present)
1147
+ col = by if by in tmp.columns else "corruption_score"
1148
+ tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
1149
+ tmp = tmp.drop(columns=["_dt"])
1150
+ cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
1151
+ acc = [c for c in cols_out if c in tmp.columns]
1152
+ return tmp[acc].head(500)
1153
+
1154
+ def refresh_results(df, cluster_choice, domain_choice, sender_choice, lang_choice, sentiment_choice, tag_choice, start, end, sort_by, sort_dir):
1155
+ if df is None or len(df) == 0:
1156
+ return pd.DataFrame()
1157
+ filt = _apply_filters(df, cluster_choice, domain_choice, sender_choice, lang_choice, sentiment_choice, tag_choice, start, end)
1158
+ return _sort_results(filt, sort_by, sort_dir)
1159
 
1160
# Re-run filtering/sorting whenever any filter or sort control changes.
# Every control gets the same handler with the full set of current control
# values as inputs, so a change to one re-evaluates all filters at once.
for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]:
    ctrl.change(
        refresh_results,
        inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
        outputs=[results_df]
    )

# Reset button: push the default value into every filter/sort control, then
# chain a refresh so the table immediately reflects the cleared filters.
# The lambda's list order must match `outputs` exactly.
reset_btn.click(
    lambda: ["(any)", "(any)", "(any)", "(any)", "(any)", "(any)", "", "", "corruption_score", "desc"],
    inputs=[],
    outputs=[cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]
).then(
    refresh_results,
    inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
    outputs=[results_df]
)
1176
 
 
1203
  q_full = hstack([q_word, q_char], format="csr")
1204
  return q_full
1205
 
1206
+ def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
1207
  if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
1208
  return pd.DataFrame(), []
1209
  q_terms = _tokenize_query(q)
 
1220
  inds = indices[0]
1221
  sims = 1.0 - distances[0]
1222
  results = df.iloc[inds].copy()
1223
+ results["search_score"] = sims
1224
  elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
1225
  D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
1226
  inds = I[0]
1227
  sims = D[0]
1228
  results = df.iloc[inds].copy()
1229
+ results["search_score"] = sims
1230
  else:
1231
  return pd.DataFrame(), q_terms
1232
 
1233
+ # blend with corruption score lightly
 
1234
  cs = results["corruption_score"].fillna(0.0)
1235
  cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
1236
+ results["_blend"] = 0.7*results["search_score"].values + 0.3*cs.values
1237
+ # sort UI-selected way
1238
+ if sort_by == "search_score":
1239
+ results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
1240
+ else:
1241
+ # use blended but keep sort_by if chosen
1242
+ if sort_by in results.columns:
1243
+ results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
1244
+ else:
1245
+ results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
1246
+ results = results.drop(columns=["_blend"])
1247
+ cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
1248
  return results[cols].head(50), q_terms
1249
 
1250
# Wire the search button: run the query against the prebuilt index (LSA or
# FAISS, per the state flags) and show the top hits plus the parsed query
# terms (used later for highlighting selected rows).
search_btn.click(
    search_fn,
    inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir],
    outputs=[results_df, state_query_terms]
)
1255
 
1256
+ def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str],
1257
+ query_terms: Optional[List[str]], extra_terms: List[str], do_highlight: bool):
1258
  try:
1259
  row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
1260
  except Exception:
 
1279
  row = cand.iloc[0]
1280
  cid = int(row.get("cluster_id", -1))
1281
  clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
1282
+ return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
1283
 
1284
# Clicking a row in the results table renders that email as highlighted HTML
# in the detail pane; query terms and extra terms drive the highlighting.
results_df.select(
    on_row_select,
    inputs=[results_df, state_df, state_term_names, state_query_terms, state_extra_terms, state_highlight],
    outputs=[email_view]
)
1289
 
1290
+ # Click cluster summary to filter
1291
def on_cluster_click(evt: gr.SelectData, df_sum: pd.DataFrame):
    """Map a click on the cluster-summary table to a cluster filter value.

    Returns the clicked row's ``label`` (fed into the cluster dropdown), or
    the sentinel "(any)" whenever the selection is unusable: no summary data,
    an index we cannot interpret, an out-of-range row, or a non-string label.

    NOTE: the `gr.SelectData` annotation is required — Gradio injects the
    select-event payload based on this type hint.
    """
    # evt.index may be a scalar row index or a (row, col) pair depending on
    # the component; take the row component either way.
    try:
        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        row_idx = getattr(evt, "index", None)
    if row_idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    # Coerce to int (covers numpy integer indices) and bounds-check so a
    # stale selection on a shrunken table cannot raise inside the callback.
    try:
        row_idx = int(row_idx)
    except (TypeError, ValueError):
        return "(any)"
    if not 0 <= row_idx < len(df_sum):
        return "(any)"
    label = df_sum.iloc[row_idx]["label"]
    return label if isinstance(label, str) else "(any)"
1300
+
1301
# Clicking a cluster in the summary table writes that cluster's label into
# the cluster dropdown, then chains a refresh so the results table is
# immediately re-filtered to the chosen cluster.
cluster_counts_df.select(
    on_cluster_click,
    inputs=[cluster_counts_df],
    outputs=[cluster_drop]
).then(
    refresh_results,
    inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
    outputs=[results_df]
)
1310
+
1311
# Launch the Gradio UI only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()