Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,13 @@ from sklearn.preprocessing import Normalizer
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from scipy.sparse import hstack
|
| 29 |
|
| 30 |
# Optional fast ANN (CPU)
|
|
@@ -34,7 +41,7 @@ try:
|
|
| 34 |
except Exception:
|
| 35 |
FAISS_OK = False
|
| 36 |
|
| 37 |
-
# Optional
|
| 38 |
try:
|
| 39 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 40 |
VADER_OK = True
|
|
@@ -42,44 +49,59 @@ except Exception:
|
|
| 42 |
VADER_OK = False
|
| 43 |
|
| 44 |
# =================== Regex & Flags ===================
|
| 45 |
-
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 46 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 47 |
-
|
| 48 |
-
# URLs -> "URL" (reduce feature bloat).
|
| 49 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 50 |
-
|
| 51 |
-
# Quote lines ("> ...")
|
| 52 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
| 53 |
-
|
| 54 |
-
# Signature separator: lines after "-- " (standard)
|
| 55 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
| 56 |
-
|
| 57 |
-
# Device footers
|
| 58 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 59 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
| 60 |
-
|
| 61 |
-
# Forward/quoted markers
|
| 62 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 63 |
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 64 |
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 65 |
|
| 66 |
-
# Toggle for language detection (skip for speed)
|
| 67 |
SKIP_LANGDETECT = True
|
| 68 |
|
| 69 |
-
#
|
| 70 |
SUSPECT_PHRASES = [
|
| 71 |
-
|
| 72 |
-
"
|
| 73 |
-
"
|
| 74 |
-
"
|
| 75 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
]
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
# Entity regexes
|
| 79 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 80 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 81 |
-
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order)\b', re.I)
|
| 82 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 85 |
CORR_LEX = {
|
|
@@ -89,19 +111,15 @@ CORR_LEX = {
|
|
| 89 |
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 90 |
}
|
| 91 |
|
| 92 |
-
# =================== Label cleanup helpers ===================
|
| 93 |
EN_STOP = {
|
| 94 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
| 95 |
"that","this","it","be","are","was","were","an","a","you","your","we","our","us",
|
| 96 |
"re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
|
| 97 |
"message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
|
| 98 |
-
"herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
|
| 99 |
-
"ny"
|
| 100 |
-
}
|
| 101 |
-
HE_STOP = {
|
| 102 |
-
"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
|
| 103 |
-
"שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"
|
| 104 |
}
|
|
|
|
| 105 |
MONTHS = {
|
| 106 |
"jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
|
| 107 |
"january","february","march","april","june","july","august","september",
|
|
@@ -116,14 +134,10 @@ def _is_junk_term(t: str) -> bool:
|
|
| 116 |
tl = t.lower()
|
| 117 |
if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
|
| 118 |
return True
|
| 119 |
-
if EMAIL_LIKE_RE.search(tl):
|
| 120 |
-
|
| 121 |
-
if
|
| 122 |
-
|
| 123 |
-
if NUMERIC_RE.match(tl):
|
| 124 |
-
return True
|
| 125 |
-
if ONE_CHAR_RE.match(tl):
|
| 126 |
-
return True
|
| 127 |
return False
|
| 128 |
|
| 129 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
@@ -147,7 +161,7 @@ def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarra
|
|
| 147 |
break
|
| 148 |
return cleaned
|
| 149 |
|
| 150 |
-
# =================== HTML/Text
|
| 151 |
def html_to_text(html: str) -> str:
|
| 152 |
if not html:
|
| 153 |
return ""
|
|
@@ -183,10 +197,19 @@ def parse_name_email(s: str) -> Tuple[str, str]:
|
|
| 183 |
return (m.group(1) or "").strip(), (m.group(2) or "").strip()
|
| 184 |
return "", s.strip()
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
| 187 |
headers: Dict[str, str] = {}
|
| 188 |
lines = (text or "").splitlines()
|
| 189 |
-
header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
|
| 190 |
i = 0
|
| 191 |
saw_header = False
|
| 192 |
while i < len(lines):
|
|
@@ -234,15 +257,27 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 234 |
if str(raw.get("type", "")).lower() == "meta":
|
| 235 |
return {}
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 238 |
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 239 |
if html_content and not body_text_raw:
|
| 240 |
body_text_raw = html_to_text(html_content)
|
| 241 |
-
|
| 242 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 243 |
|
| 244 |
subject_text = ""
|
| 245 |
from_name = from_email = from_domain = ""
|
|
|
|
| 246 |
date_val = raw.get("date") or raw.get("Date") or ""
|
| 247 |
|
| 248 |
if body_text_raw:
|
|
@@ -250,6 +285,8 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 250 |
subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
|
| 251 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 252 |
date_val = headers.get("Date", "") or date_val
|
|
|
|
|
|
|
| 253 |
|
| 254 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 255 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
|
@@ -267,6 +304,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 267 |
sender = raw.get("from") or raw.get("From") or ""
|
| 268 |
from_name, from_email = parse_name_email(sender)
|
| 269 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
|
|
|
| 270 |
|
| 271 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 272 |
|
|
@@ -302,9 +340,11 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 302 |
"from_name": from_name,
|
| 303 |
"from_email": from_email,
|
| 304 |
"from_domain": from_domain,
|
|
|
|
| 305 |
"subject": subject_norm,
|
| 306 |
"body_text": body_text,
|
| 307 |
"lang": lang,
|
|
|
|
| 308 |
"text_hash": text_hash,
|
| 309 |
}
|
| 310 |
|
|
@@ -322,6 +362,8 @@ def has_suspect_tag(text: str) -> List[str]:
|
|
| 322 |
if "wire" in low or "transfer" in low or "cash" in low:
|
| 323 |
if "finance" not in tags:
|
| 324 |
tags.append("finance")
|
|
|
|
|
|
|
| 325 |
return tags
|
| 326 |
|
| 327 |
def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -337,19 +379,60 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 337 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 338 |
return df
|
| 339 |
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
subject = (row.get("subject") or "").strip()
|
| 342 |
body = (row.get("body_text") or "").strip()
|
| 343 |
from_email = row.get("from_email") or ""
|
| 344 |
date = row.get("date") or ""
|
| 345 |
tags = row.get("tags") or []
|
|
|
|
| 346 |
sentiment = row.get("sentiment") or "(unknown)"
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
def hi(text: str) -> str:
|
| 349 |
-
if not text or not
|
| 350 |
return text
|
| 351 |
out = text
|
| 352 |
-
for qt in
|
| 353 |
if not qt:
|
| 354 |
continue
|
| 355 |
try:
|
|
@@ -366,9 +449,17 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 366 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 367 |
body_html = body_h.replace("\n", "<br/>")
|
| 368 |
|
|
|
|
|
|
|
|
|
|
| 369 |
tag_html = ""
|
|
|
|
| 370 |
if isinstance(tags, list) and tags:
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 374 |
|
|
@@ -401,9 +492,7 @@ class BM25Transformer:
|
|
| 401 |
self.avgdl_ = None
|
| 402 |
|
| 403 |
def fit(self, X):
|
| 404 |
-
# X is term-frequency (CountVectorizer)
|
| 405 |
N = X.shape[0]
|
| 406 |
-
# document frequency per term
|
| 407 |
df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
|
| 408 |
self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
|
| 409 |
dl = np.asarray(X.sum(axis=1)).ravel()
|
|
@@ -423,7 +512,6 @@ class BM25Transformer:
|
|
| 423 |
data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
|
| 424 |
return X
|
| 425 |
|
| 426 |
-
# Add enrichment tokens to help the model lock onto key signals
|
| 427 |
def enrich_text(row: pd.Series) -> str:
|
| 428 |
subj = row.get("subject","") or ""
|
| 429 |
body = row.get("body_text","") or ""
|
|
@@ -433,6 +521,7 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 433 |
if PHONE_RE.search(t): tokens.append("__HAS_PHONE__")
|
| 434 |
if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
|
| 435 |
if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
|
|
|
|
| 436 |
return (t + " " + " ".join(tokens)).strip()
|
| 437 |
|
| 438 |
# =================== Cluster labeling: PMI bigrams ===================
|
|
@@ -466,7 +555,7 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
| 466 |
labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
|
| 467 |
return labels_out
|
| 468 |
|
| 469 |
-
# =================== Auto-k
|
| 470 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 471 |
n = X.shape[0]
|
| 472 |
if n > 40000:
|
|
@@ -490,10 +579,8 @@ def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
|
| 490 |
return k_best, dict(zip(ks, inertias))
|
| 491 |
|
| 492 |
def auto_k_rule(n_docs: int) -> int:
|
| 493 |
-
# Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
|
| 494 |
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 495 |
|
| 496 |
-
# =================== Merge close clusters (LSA space only to save RAM) ===================
|
| 497 |
def merge_close_clusters(labels, centers, thresh=0.92):
|
| 498 |
centers = sk_normalize(centers)
|
| 499 |
sim = cosine_similarity(centers, centers)
|
|
@@ -517,11 +604,9 @@ def merge_close_clusters(labels, centers, thresh=0.92):
|
|
| 517 |
labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
|
| 518 |
return labels2
|
| 519 |
|
| 520 |
-
# =================== Seeded centroids (only if LSA enabled) ===================
|
| 521 |
def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
|
| 522 |
lsa_components: np.ndarray, norm_obj: Normalizer,
|
| 523 |
d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
|
| 524 |
-
# Build a few unit vectors in word-term space based on lexicons
|
| 525 |
seeds_word = []
|
| 526 |
vocab = count_vec.vocabulary_
|
| 527 |
for _, words in lexicons.items():
|
|
@@ -536,30 +621,45 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
|
|
| 536 |
seeds_word.append(v)
|
| 537 |
if not seeds_word:
|
| 538 |
return None
|
| 539 |
-
# Lift to full feature space (word + char) by padding zeros for char dims
|
| 540 |
seeds_full = []
|
| 541 |
for v in seeds_word:
|
| 542 |
vf = np.zeros((d_full,), dtype=np.float32)
|
| 543 |
vf[:d_word] = v
|
| 544 |
seeds_full.append(vf)
|
| 545 |
-
seeds_full = np.stack(seeds_full, axis=0)
|
| 546 |
-
|
| 547 |
-
seeds_red = seeds_full @ lsa_components.T # (s, lsa_dim)
|
| 548 |
seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
|
| 549 |
-
# If fewer than k seeds, KMeans will accept; scikit-learn requires init shape == (k, d)
|
| 550 |
-
# We’ll return only if seeds count >= 2 to be meaningful; otherwise None
|
| 551 |
if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
|
| 552 |
return seeds_red
|
| 553 |
return None
|
| 554 |
|
| 555 |
-
# ===================
|
| 556 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
score = 0.0
|
| 558 |
txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 559 |
for ph in SUSPECT_PHRASES:
|
| 560 |
if ph in txt:
|
| 561 |
score += 2.0
|
| 562 |
break
|
|
|
|
|
|
|
| 563 |
if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
|
| 564 |
score += 1.5
|
| 565 |
if MONEY_RE.search(txt): score += 0.7
|
|
@@ -569,6 +669,14 @@ def corruption_score(row):
|
|
| 569 |
body_len = len(row.get("body_text",""))
|
| 570 |
if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
|
| 571 |
score += 0.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
return score
|
| 573 |
|
| 574 |
# =================== Gradio UI ===================
|
|
@@ -587,13 +695,11 @@ CSS = """
|
|
| 587 |
mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
|
| 588 |
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 589 |
.small { color:#475569; font-size:12px; }
|
|
|
|
| 590 |
"""
|
| 591 |
|
| 592 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 593 |
-
gr.Markdown(""
|
| 594 |
-
# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans
|
| 595 |
-
**Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, corruption score, and sentiment.
|
| 596 |
-
""")
|
| 597 |
|
| 598 |
with gr.Row():
|
| 599 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
|
@@ -613,21 +719,25 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 613 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 614 |
with gr.Row():
|
| 615 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
|
|
|
| 616 |
|
| 617 |
-
with gr.Accordion("
|
| 618 |
with gr.Row():
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)"
|
| 623 |
-
)
|
| 624 |
with gr.Row():
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
)
|
|
|
|
|
|
|
|
|
|
| 628 |
with gr.Row():
|
| 629 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 630 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
|
|
|
|
|
|
| 631 |
|
| 632 |
with gr.Row():
|
| 633 |
run_btn = gr.Button("Process", variant="primary")
|
|
@@ -635,9 +745,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 635 |
status = gr.Markdown("")
|
| 636 |
|
| 637 |
with gr.Row():
|
| 638 |
-
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
|
| 639 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
gr.Markdown("### Search")
|
| 642 |
with gr.Row():
|
| 643 |
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
|
@@ -646,17 +760,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 646 |
email_view = gr.HTML(label="Reader")
|
| 647 |
|
| 648 |
# State
|
| 649 |
-
state_df = gr.State()
|
| 650 |
-
state_vec = gr.State()
|
| 651 |
-
state_X_reduced = gr.State()
|
| 652 |
-
state_index = gr.State()
|
| 653 |
-
state_term_names = gr.State()
|
| 654 |
-
state_query_terms = gr.State()
|
| 655 |
state_use_lsa = gr.State()
|
| 656 |
state_use_faiss = gr.State()
|
| 657 |
state_svd = gr.State()
|
| 658 |
state_norm = gr.State()
|
| 659 |
-
state_dims = gr.State()
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# -------- IO helpers --------
|
| 662 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
@@ -691,6 +807,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 691 |
df: pd.DataFrame,
|
| 692 |
cluster: Optional[str],
|
| 693 |
domain: Optional[str],
|
|
|
|
|
|
|
| 694 |
sentiment: str,
|
| 695 |
tag_value: str,
|
| 696 |
start: str,
|
|
@@ -704,10 +822,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 704 |
out = out[out["cluster_id"] == cid]
|
| 705 |
if domain and domain != "(any)":
|
| 706 |
out = out[out["from_domain"] == domain]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 708 |
out = out[out["sentiment"].astype(str) == sentiment]
|
| 709 |
if tag_value and tag_value != "(any)":
|
| 710 |
-
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))
|
|
|
|
| 711 |
if start:
|
| 712 |
try:
|
| 713 |
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
|
@@ -722,19 +845,47 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 722 |
pass
|
| 723 |
return out
|
| 724 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
# -------- Main pipeline --------
|
| 726 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 727 |
-
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss
|
|
|
|
| 728 |
if inbox_file is None:
|
| 729 |
return ("**Please upload a file.**",
|
| 730 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
|
|
|
|
| 731 |
|
| 732 |
use_lang = not bool(skip_lang)
|
| 733 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
recs = _load_json_records(inbox_file.name)
|
| 735 |
if not recs:
|
| 736 |
return ("**No valid records found.**",
|
| 737 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
|
|
|
|
| 738 |
|
| 739 |
# Normalize
|
| 740 |
normd = []
|
|
@@ -745,19 +896,35 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 745 |
df = pd.DataFrame(normd)
|
| 746 |
if df.empty:
|
| 747 |
return ("**No usable email records after normalization.**",
|
| 748 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
|
|
|
|
| 749 |
|
| 750 |
# Deduplicate conservatively
|
| 751 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 752 |
|
| 753 |
-
# Tags
|
| 754 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 755 |
df = compute_sentiment_column(df)
|
| 756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
# Enriched texts (adds __HAS_*__ flags)
|
| 758 |
texts = list(df.apply(enrich_text, axis=1))
|
| 759 |
|
| 760 |
-
# === Vectorization
|
| 761 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 762 |
count_vec = CountVectorizer(
|
| 763 |
analyzer="word",
|
|
@@ -784,42 +951,49 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 784 |
d_char = X_char.shape[1]
|
| 785 |
d_full = X_full.shape[1]
|
| 786 |
|
| 787 |
-
# LSA
|
| 788 |
use_lsa = bool(use_lsa)
|
| 789 |
X_reduced = None
|
| 790 |
svd_obj = None
|
| 791 |
norm_obj = None
|
| 792 |
if use_lsa:
|
| 793 |
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 794 |
-
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
|
| 795 |
norm_obj = Normalizer(copy=False)
|
| 796 |
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 797 |
del X_reduced_tmp
|
| 798 |
gc.collect()
|
| 799 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
# K selection
|
| 801 |
if bool(auto_k):
|
| 802 |
if use_lsa:
|
| 803 |
k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
|
| 804 |
else:
|
| 805 |
-
# fallback: heuristic rule on doc count
|
| 806 |
k = auto_k_rule(X_full.shape[0])
|
| 807 |
else:
|
| 808 |
k = max(10, int(k_clusters or 350))
|
| 809 |
|
| 810 |
-
# Optional seeded init (only in LSA space
|
| 811 |
init = None
|
| 812 |
if use_lsa:
|
| 813 |
seeds = seeded_centroids_in_lsa(
|
| 814 |
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 815 |
d_word=d_word, d_full=d_full, k=k
|
| 816 |
)
|
| 817 |
-
if seeds is not None and seeds.shape[0]
|
| 818 |
-
|
| 819 |
-
# For MiniBatchKMeans, we must provide exactly k centers or fall back to k-means++.
|
| 820 |
-
# So use seeds only if seeds.shape[0] == k; otherwise None.
|
| 821 |
-
if seeds.shape[0] == k:
|
| 822 |
-
init = seeds
|
| 823 |
|
| 824 |
# KMeans clustering (use LSA space if enabled)
|
| 825 |
X_space = (X_reduced if use_lsa else X_full)
|
|
@@ -832,18 +1006,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 832 |
)
|
| 833 |
labels = kmeans.fit_predict(X_space)
|
| 834 |
|
| 835 |
-
#
|
| 836 |
if use_lsa:
|
| 837 |
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
|
| 838 |
|
| 839 |
df["cluster_id"] = labels
|
| 840 |
|
| 841 |
-
#
|
| 842 |
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 843 |
df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 844 |
|
| 845 |
-
# CorruptionScore
|
| 846 |
-
df["corruption_score"] = df.apply(corruption_score, axis=1)
|
| 847 |
|
| 848 |
# Build search index
|
| 849 |
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
|
@@ -877,15 +1051,33 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 877 |
)
|
| 878 |
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 879 |
|
| 880 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
show_df = df.copy()
|
| 882 |
if "date" in show_df.columns and show_df["date"].notna().any():
|
| 883 |
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 884 |
else:
|
| 885 |
show_df["_dt"] = pd.NaT
|
| 886 |
show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
|
| 887 |
-
|
| 888 |
-
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score"]
|
| 889 |
out_table = show_df[cols_out].head(500)
|
| 890 |
|
| 891 |
vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
|
|
@@ -894,66 +1086,91 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 894 |
f"**Processed {len(df):,} emails** \n"
|
| 895 |
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
|
| 896 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 897 |
-
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}"
|
|
|
|
| 898 |
)
|
| 899 |
|
| 900 |
gc.collect()
|
| 901 |
|
| 902 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 903 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
|
|
|
|
|
|
| 904 |
|
| 905 |
return (
|
| 906 |
status_md,
|
| 907 |
cluster_counts, domain_counts,
|
|
|
|
| 908 |
out_table,
|
| 909 |
df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 910 |
use_lsa, bool(use_faiss),
|
| 911 |
-
cluster_update, domain_update,
|
| 912 |
svd_obj, norm_obj,
|
| 913 |
-
(d_word, d_char)
|
|
|
|
| 914 |
)
|
| 915 |
|
| 916 |
(run_btn.click)(
|
| 917 |
process_file,
|
| 918 |
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 919 |
-
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss
|
|
|
|
| 920 |
outputs=[status,
|
| 921 |
cluster_counts_df, domain_counts_df,
|
|
|
|
| 922 |
results_df,
|
| 923 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 924 |
state_use_lsa, state_use_faiss,
|
| 925 |
-
cluster_drop, domain_drop,
|
| 926 |
state_svd, state_norm,
|
| 927 |
-
state_dims
|
|
|
|
| 928 |
)
|
| 929 |
|
| 930 |
# -------- Filtering & Search --------
|
| 931 |
-
def
|
| 932 |
-
if df is None or len(df)
|
| 933 |
return pd.DataFrame()
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
if "date" in filt.columns and filt["date"].notna().any():
|
| 937 |
-
tmp = filt.copy()
|
| 938 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
|
| 943 |
-
for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
|
| 944 |
ctrl.change(
|
| 945 |
refresh_results,
|
| 946 |
-
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 947 |
outputs=[results_df]
|
| 948 |
)
|
| 949 |
|
| 950 |
reset_btn.click(
|
| 951 |
-
lambda: [
|
| 952 |
inputs=[],
|
| 953 |
-
outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
|
| 954 |
).then(
|
| 955 |
refresh_results,
|
| 956 |
-
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 957 |
outputs=[results_df]
|
| 958 |
)
|
| 959 |
|
|
@@ -986,7 +1203,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 986 |
q_full = hstack([q_word, q_char], format="csr")
|
| 987 |
return q_full
|
| 988 |
|
| 989 |
-
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj):
|
| 990 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 991 |
return pd.DataFrame(), []
|
| 992 |
q_terms = _tokenize_query(q)
|
|
@@ -1003,31 +1220,41 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1003 |
inds = indices[0]
|
| 1004 |
sims = 1.0 - distances[0]
|
| 1005 |
results = df.iloc[inds].copy()
|
| 1006 |
-
results["
|
| 1007 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1008 |
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 1009 |
inds = I[0]
|
| 1010 |
sims = D[0]
|
| 1011 |
results = df.iloc[inds].copy()
|
| 1012 |
-
results["
|
| 1013 |
else:
|
| 1014 |
return pd.DataFrame(), q_terms
|
| 1015 |
|
| 1016 |
-
|
| 1017 |
-
# Rerank by a blend: 0.7 * ANN score + 0.3 * corruption_score (scaled)
|
| 1018 |
cs = results["corruption_score"].fillna(0.0)
|
| 1019 |
cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
|
| 1020 |
-
results["_blend"] = 0.7*results["
|
| 1021 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
return results[cols].head(50), q_terms
|
| 1023 |
|
| 1024 |
search_btn.click(
|
| 1025 |
search_fn,
|
| 1026 |
-
inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss, state_svd, state_norm],
|
| 1027 |
outputs=[results_df, state_query_terms]
|
| 1028 |
)
|
| 1029 |
|
| 1030 |
-
def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str],
|
|
|
|
| 1031 |
try:
|
| 1032 |
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
| 1033 |
except Exception:
|
|
@@ -1052,13 +1279,34 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1052 |
row = cand.iloc[0]
|
| 1053 |
cid = int(row.get("cluster_id", -1))
|
| 1054 |
clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
|
| 1055 |
-
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel)
|
| 1056 |
|
| 1057 |
results_df.select(
|
| 1058 |
on_row_select,
|
| 1059 |
-
inputs=[results_df, state_df, state_term_names, state_query_terms],
|
| 1060 |
outputs=[email_view]
|
| 1061 |
)
|
| 1062 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1063 |
if __name__ == "__main__":
|
| 1064 |
demo.launch()
|
|
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
| 28 |
+
# Optional light anomaly detection
|
| 29 |
+
try:
|
| 30 |
+
from sklearn.ensemble import IsolationForest
|
| 31 |
+
ISO_OK = True
|
| 32 |
+
except Exception:
|
| 33 |
+
ISO_OK = False
|
| 34 |
+
|
| 35 |
from scipy.sparse import hstack
|
| 36 |
|
| 37 |
# Optional fast ANN (CPU)
|
|
|
|
| 41 |
except Exception:
|
| 42 |
FAISS_OK = False
|
| 43 |
|
| 44 |
+
# Optional tiny sentiment
|
| 45 |
try:
|
| 46 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 47 |
VADER_OK = True
|
|
|
|
| 49 |
VADER_OK = False
|
| 50 |
|
| 51 |
# =================== Regex & Flags ===================
|
|
|
|
| 52 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
|
|
|
|
|
|
| 53 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
|
|
|
|
|
|
| 54 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
|
|
|
|
|
|
| 55 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
|
|
|
|
|
|
| 56 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 57 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
|
|
|
|
|
|
| 58 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 59 |
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 60 |
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 61 |
|
|
|
|
| 62 |
SKIP_LANGDETECT = True
|
| 63 |
|
| 64 |
+
# ==== Expanded corruption lexicon ====
|
| 65 |
SUSPECT_PHRASES = [
|
| 66 |
+
# core corruption/finance
|
| 67 |
+
"off the books","cover up","kickback","bribe","under the table",
|
| 68 |
+
"no inspection","special fee","friendly payment","confidential deal",
|
| 69 |
+
"nobody will find out","pay to play","cash only","shell company",
|
| 70 |
+
"bid rigging","embezzle","slush fund","false invoice","ghost employee",
|
| 71 |
+
"contract splitting","grease payment","unreported","unrecorded",
|
| 72 |
+
# secrecy/evasion
|
| 73 |
+
"off the record","just between us","don’t quote me on this","dont quote me on this",
|
| 74 |
+
"we never had this conversation","keep this between us","not ethical","illegal",
|
| 75 |
+
"grey area","gray area","write off","failed investment","they owe it to me",
|
| 76 |
+
# off-channel comms
|
| 77 |
+
"let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
|
| 78 |
+
"don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
|
| 79 |
+
"tell you on the phone","talk in person","come by my office","vpn",
|
| 80 |
+
# financial secrecy & accounting games
|
| 81 |
+
"tax haven","off-shore account","offshore account","backdate","pull earnings forward",
|
| 82 |
+
"delete this email","no inspection","special fees","wire instructions",
|
| 83 |
]
|
| 84 |
+
# Evasive acronyms / slang (case-insensitive)
|
| 85 |
+
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
|
| 86 |
|
| 87 |
+
# Entity regexes
|
| 88 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 89 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 90 |
+
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
|
| 91 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 92 |
+
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
|
| 93 |
+
|
| 94 |
+
# Off-channel patterns (apps / phrases)
|
| 95 |
+
OFFCHANNEL_PATTERNS = [
|
| 96 |
+
r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
|
| 97 |
+
r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
|
| 98 |
+
r"take this offline", r"don.?t (text|email) (me|this)",
|
| 99 |
+
r"\bOTR\b", r"\bTOL\b", r"\bTYOP\b", r"\bLDL\b"
|
| 100 |
+
]
|
| 101 |
+
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
| 102 |
+
|
| 103 |
+
# Common personal mail domains (used with user-specified trusted org domains)
|
| 104 |
+
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 105 |
|
| 106 |
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 107 |
CORR_LEX = {
|
|
|
|
| 111 |
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 112 |
}
|
| 113 |
|
| 114 |
+
# =================== Label cleanup helpers (unchanged core) ===================
|
| 115 |
EN_STOP = {
|
| 116 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
| 117 |
"that","this","it","be","are","was","were","an","a","you","your","we","our","us",
|
| 118 |
"re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
|
| 119 |
"message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
|
| 120 |
+
"herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard","ny"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
}
|
| 122 |
+
HE_STOP = {"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה","שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"}
|
| 123 |
MONTHS = {
|
| 124 |
"jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
|
| 125 |
"january","february","march","april","june","july","august","september",
|
|
|
|
| 134 |
tl = t.lower()
|
| 135 |
if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
|
| 136 |
return True
|
| 137 |
+
if EMAIL_LIKE_RE.search(tl): return True
|
| 138 |
+
if YEAR_RE.match(tl): return True
|
| 139 |
+
if NUMERIC_RE.match(tl): return True
|
| 140 |
+
if ONE_CHAR_RE.match(tl): return True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
return False
|
| 142 |
|
| 143 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
|
|
| 161 |
break
|
| 162 |
return cleaned
|
| 163 |
|
| 164 |
+
# =================== HTML/Text & Header Parsing ===================
|
| 165 |
def html_to_text(html: str) -> str:
|
| 166 |
if not html:
|
| 167 |
return ""
|
|
|
|
| 197 |
return (m.group(1) or "").strip(), (m.group(2) or "").strip()
|
| 198 |
return "", s.strip()
|
| 199 |
|
| 200 |
+
def parse_multi_emails(s: str) -> List[str]:
    """Extract every e-mail address from a comma-separated recipient header.

    Uses the stdlib RFC 5322 parser instead of a hand-rolled comma split, so
    quoted display names that themselves contain commas are handled correctly,
    e.g. '"Doe, John" <jd@x.com>, jane@y.org' -> ['jd@x.com', 'jane@y.org'].

    Args:
        s: Raw ``To:``/``Cc:`` header value; may be empty or ``None``.

    Returns:
        Address strings in order of appearance (possibly empty list).
    """
    if not s:
        return []
    from email.utils import getaddresses  # stdlib RFC 5322 address-list parser
    # getaddresses may emit bare words for malformed fragments; keep only
    # tokens that actually look like addresses (contain an '@').
    return [addr for _, addr in getaddresses([s]) if addr and "@" in addr]
|
| 208 |
+
|
| 209 |
def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
| 210 |
headers: Dict[str, str] = {}
|
| 211 |
lines = (text or "").splitlines()
|
| 212 |
+
header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject|Subject:|To:|Cc:|Bcc:|From:):')
|
| 213 |
i = 0
|
| 214 |
saw_header = False
|
| 215 |
while i < len(lines):
|
|
|
|
| 257 |
if str(raw.get("type", "")).lower() == "meta":
|
| 258 |
return {}
|
| 259 |
|
| 260 |
+
# attachments (names); accept common schemas
|
| 261 |
+
attach_names = []
|
| 262 |
+
atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
|
| 263 |
+
if isinstance(atts, list):
|
| 264 |
+
for a in atts:
|
| 265 |
+
if isinstance(a, dict):
|
| 266 |
+
name = a.get("filename") or a.get("name") or ""
|
| 267 |
+
else:
|
| 268 |
+
name = str(a)
|
| 269 |
+
if name:
|
| 270 |
+
attach_names.append(str(name))
|
| 271 |
+
|
| 272 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 273 |
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 274 |
if html_content and not body_text_raw:
|
| 275 |
body_text_raw = html_to_text(html_content)
|
|
|
|
| 276 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 277 |
|
| 278 |
subject_text = ""
|
| 279 |
from_name = from_email = from_domain = ""
|
| 280 |
+
to_emails: List[str] = []
|
| 281 |
date_val = raw.get("date") or raw.get("Date") or ""
|
| 282 |
|
| 283 |
if body_text_raw:
|
|
|
|
| 285 |
subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
|
| 286 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 287 |
date_val = headers.get("Date", "") or date_val
|
| 288 |
+
to_emails = parse_multi_emails(headers.get("To","") or (raw.get("to") or "")) + \
|
| 289 |
+
parse_multi_emails(headers.get("Cc","") or (raw.get("cc") or ""))
|
| 290 |
|
| 291 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 292 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
|
|
|
| 304 |
sender = raw.get("from") or raw.get("From") or ""
|
| 305 |
from_name, from_email = parse_name_email(sender)
|
| 306 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 307 |
+
to_emails = parse_multi_emails(raw.get("to") or "") + parse_multi_emails(raw.get("cc") or "")
|
| 308 |
|
| 309 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 310 |
|
|
|
|
| 340 |
"from_name": from_name,
|
| 341 |
"from_email": from_email,
|
| 342 |
"from_domain": from_domain,
|
| 343 |
+
"to_emails": to_emails,
|
| 344 |
"subject": subject_norm,
|
| 345 |
"body_text": body_text,
|
| 346 |
"lang": lang,
|
| 347 |
+
"attachments": attach_names,
|
| 348 |
"text_hash": text_hash,
|
| 349 |
}
|
| 350 |
|
|
|
|
| 362 |
if "wire" in low or "transfer" in low or "cash" in low:
|
| 363 |
if "finance" not in tags:
|
| 364 |
tags.append("finance")
|
| 365 |
+
if OFFCHANNEL_RE.search(low) or EVASIVE_ACRO_RE.search(low):
|
| 366 |
+
tags.append("off-channel")
|
| 367 |
return tags
|
| 368 |
|
| 369 |
def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 379 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 380 |
return df
|
| 381 |
|
| 382 |
+
# Visual highlight helpers
|
| 383 |
+
def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
    """Collect up to 24 literal strings worth highlighting for one email row.

    Sources, in priority order: suspect phrases present in the text, entity
    markers ("$", "invoice"), a "phone" marker when a phone number matches,
    and user-supplied extra phrases that literally occur in the text.
    """
    blob = (row.get("subject", "") + " " + row.get("body_text", "")).lower()
    found: List[str] = []
    # Suspect phrases that literally occur in subject/body.
    found.extend(p for p in SUSPECT_PHRASES if p in blob)
    # Entity markers.
    if MONEY_RE.search(blob):
        found.append("$")
    if INVOICE_RE.search(blob):
        found.append("invoice")
    # Regex-based hit recorded as a literal sample token.
    if PHONE_RE.search(row.get("body_text", "") or ""):
        found.append("phone")
    # Extras supplied by the user in the UI.
    for raw in extra_terms or []:
        term = raw.strip()
        if term and term.lower() in blob:
            found.append(term)
    # Case-insensitive dedupe, keeping first occurrence, capped at 24 terms.
    seen = set()
    unique = []
    for term in found:
        key = term.lower()
        if key not in seen:
            unique.append(term)
            seen.add(key)
    return unique[:24]
|
| 407 |
+
|
| 408 |
+
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
|
| 409 |
+
cluster_label: Optional[str] = None,
|
| 410 |
+
do_highlight: bool = True,
|
| 411 |
+
extra_terms: Optional[List[str]] = None) -> str:
|
| 412 |
subject = (row.get("subject") or "").strip()
|
| 413 |
body = (row.get("body_text") or "").strip()
|
| 414 |
from_email = row.get("from_email") or ""
|
| 415 |
date = row.get("date") or ""
|
| 416 |
tags = row.get("tags") or []
|
| 417 |
+
flags = row.get("flags") or []
|
| 418 |
sentiment = row.get("sentiment") or "(unknown)"
|
| 419 |
|
| 420 |
+
hl_terms = []
|
| 421 |
+
if do_highlight:
|
| 422 |
+
hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
|
| 423 |
+
# make unique, case-insensitive
|
| 424 |
+
seen=set(); uniq=[]
|
| 425 |
+
for t in hl_terms:
|
| 426 |
+
tl=t.lower()
|
| 427 |
+
if tl and tl not in seen:
|
| 428 |
+
uniq.append(t); seen.add(tl)
|
| 429 |
+
hl_terms = uniq[:24]
|
| 430 |
+
|
| 431 |
def hi(text: str) -> str:
|
| 432 |
+
if not text or not do_highlight or not hl_terms:
|
| 433 |
return text
|
| 434 |
out = text
|
| 435 |
+
for qt in hl_terms:
|
| 436 |
if not qt:
|
| 437 |
continue
|
| 438 |
try:
|
|
|
|
| 449 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 450 |
body_html = body_h.replace("\n", "<br/>")
|
| 451 |
|
| 452 |
+
def pill(s, cls="tag"):
|
| 453 |
+
return f'<span class="{cls}">{s}</span>'
|
| 454 |
+
|
| 455 |
tag_html = ""
|
| 456 |
+
pills = []
|
| 457 |
if isinstance(tags, list) and tags:
|
| 458 |
+
pills += [pill(t, "tag") for t in tags]
|
| 459 |
+
if isinstance(flags, list) and flags:
|
| 460 |
+
pills += [pill(f, "tag") for f in flags]
|
| 461 |
+
if pills:
|
| 462 |
+
tag_html = " ".join(pills)
|
| 463 |
|
| 464 |
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 465 |
|
|
|
|
| 492 |
self.avgdl_ = None
|
| 493 |
|
| 494 |
def fit(self, X):
|
|
|
|
| 495 |
N = X.shape[0]
|
|
|
|
| 496 |
df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
|
| 497 |
self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
|
| 498 |
dl = np.asarray(X.sum(axis=1)).ravel()
|
|
|
|
| 512 |
data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
|
| 513 |
return X
|
| 514 |
|
|
|
|
| 515 |
def enrich_text(row: pd.Series) -> str:
|
| 516 |
subj = row.get("subject","") or ""
|
| 517 |
body = row.get("body_text","") or ""
|
|
|
|
| 521 |
if PHONE_RE.search(t): tokens.append("__HAS_PHONE__")
|
| 522 |
if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
|
| 523 |
if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
|
| 524 |
+
if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
|
| 525 |
return (t + " " + " ".join(tokens)).strip()
|
| 526 |
|
| 527 |
# =================== Cluster labeling: PMI bigrams ===================
|
|
|
|
| 555 |
labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
|
| 556 |
return labels_out
|
| 557 |
|
| 558 |
+
# =================== Auto-k & merge ===================
|
| 559 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 560 |
n = X.shape[0]
|
| 561 |
if n > 40000:
|
|
|
|
| 579 |
return k_best, dict(zip(ks, inertias))
|
| 580 |
|
| 581 |
def auto_k_rule(n_docs: int) -> int:
    """Heuristic cluster count: ~110 * sqrt(docs / 50), clamped to [120, 600]."""
    raw = math.sqrt(max(n_docs, 1) / 50.0) * 110
    return int(max(120, min(600, raw)))
|
| 583 |
|
|
|
|
| 584 |
def merge_close_clusters(labels, centers, thresh=0.92):
|
| 585 |
centers = sk_normalize(centers)
|
| 586 |
sim = cosine_similarity(centers, centers)
|
|
|
|
| 604 |
labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
|
| 605 |
return labels2
|
| 606 |
|
|
|
|
| 607 |
def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
|
| 608 |
lsa_components: np.ndarray, norm_obj: Normalizer,
|
| 609 |
d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
|
|
|
|
| 610 |
seeds_word = []
|
| 611 |
vocab = count_vec.vocabulary_
|
| 612 |
for _, words in lexicons.items():
|
|
|
|
| 621 |
seeds_word.append(v)
|
| 622 |
if not seeds_word:
|
| 623 |
return None
|
|
|
|
| 624 |
seeds_full = []
|
| 625 |
for v in seeds_word:
|
| 626 |
vf = np.zeros((d_full,), dtype=np.float32)
|
| 627 |
vf[:d_word] = v
|
| 628 |
seeds_full.append(vf)
|
| 629 |
+
seeds_full = np.stack(seeds_full, axis=0)
|
| 630 |
+
seeds_red = seeds_full @ lsa_components.T
|
|
|
|
| 631 |
seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
|
|
|
|
|
|
|
| 632 |
if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
|
| 633 |
return seeds_red
|
| 634 |
return None
|
| 635 |
|
| 636 |
+
# =================== Scoring & Flags ===================
|
| 637 |
+
def _hour_of(dt_iso: str) -> Optional[int]:
|
| 638 |
+
try:
|
| 639 |
+
if not dt_iso: return None
|
| 640 |
+
dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
|
| 641 |
+
if pd.isna(dt): return None
|
| 642 |
+
# treat UTC for lack of per-user tz; still useful as "odd hour"
|
| 643 |
+
return int(dt.hour)
|
| 644 |
+
except Exception:
|
| 645 |
+
return None
|
| 646 |
+
|
| 647 |
+
def _attachment_flags(names: List[str]) -> List[str]:
    """Return up to five display flags for attachment names matching ATTACH_NAME_RE.

    Each flag is a paperclip marker plus the name truncated to 40 characters.
    """
    hits = ["📎 " + n[:40] for n in (names or []) if ATTACH_NAME_RE.search(n)]
    return hits[:5]
|
| 653 |
+
|
| 654 |
+
def corruption_score(row, trusted_domains: set):
|
| 655 |
score = 0.0
|
| 656 |
txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 657 |
for ph in SUSPECT_PHRASES:
|
| 658 |
if ph in txt:
|
| 659 |
score += 2.0
|
| 660 |
break
|
| 661 |
+
if EVASIVE_ACRO_RE.search(txt) or OFFCHANNEL_RE.search(txt):
|
| 662 |
+
score += 1.0
|
| 663 |
if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
|
| 664 |
score += 1.5
|
| 665 |
if MONEY_RE.search(txt): score += 0.7
|
|
|
|
| 669 |
body_len = len(row.get("body_text",""))
|
| 670 |
if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
|
| 671 |
score += 0.5
|
| 672 |
+
# personal/off-channel via headers
|
| 673 |
+
fd = (row.get("from_domain") or "").lower()
|
| 674 |
+
if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
|
| 675 |
+
score += 0.5
|
| 676 |
+
# odd hours
|
| 677 |
+
h = _hour_of(row.get("date") or "")
|
| 678 |
+
if h is not None and (h < 6 or h > 22):
|
| 679 |
+
score += 0.3
|
| 680 |
return score
|
| 681 |
|
| 682 |
# =================== Gradio UI ===================
|
|
|
|
| 695 |
mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
|
| 696 |
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 697 |
.small { color:#475569; font-size:12px; }
|
| 698 |
+
.cursor { cursor:pointer; }
|
| 699 |
"""
|
| 700 |
|
| 701 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 702 |
+
gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
|
|
|
|
|
|
|
|
|
|
| 703 |
|
| 704 |
with gr.Row():
|
| 705 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
|
|
|
| 719 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 720 |
with gr.Row():
|
| 721 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 722 |
+
use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
|
| 723 |
|
| 724 |
+
with gr.Accordion("Investigation Controls", open=True):
|
| 725 |
with gr.Row():
|
| 726 |
+
trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
|
| 727 |
+
extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
|
| 728 |
+
highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
|
|
|
|
|
|
|
| 729 |
with gr.Row():
|
| 730 |
+
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 731 |
+
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
| 732 |
+
sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
|
| 733 |
+
lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
|
| 734 |
+
sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
|
| 735 |
+
tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "odd-hours", "personal-mail"], value="(any)")
|
| 736 |
with gr.Row():
|
| 737 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 738 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 739 |
+
sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
|
| 740 |
+
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 741 |
|
| 742 |
with gr.Row():
|
| 743 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
| 745 |
status = gr.Markdown("")
|
| 746 |
|
| 747 |
with gr.Row():
|
| 748 |
+
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 749 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 750 |
|
| 751 |
+
with gr.Row():
|
| 752 |
+
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
| 753 |
+
offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
|
| 754 |
+
|
| 755 |
gr.Markdown("### Search")
|
| 756 |
with gr.Row():
|
| 757 |
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
|
|
|
| 760 |
email_view = gr.HTML(label="Reader")
|
| 761 |
|
| 762 |
# State
|
| 763 |
+
state_df = gr.State()
|
| 764 |
+
state_vec = gr.State()
|
| 765 |
+
state_X_reduced = gr.State()
|
| 766 |
+
state_index = gr.State()
|
| 767 |
+
state_term_names = gr.State()
|
| 768 |
+
state_query_terms = gr.State()
|
| 769 |
state_use_lsa = gr.State()
|
| 770 |
state_use_faiss = gr.State()
|
| 771 |
state_svd = gr.State()
|
| 772 |
state_norm = gr.State()
|
| 773 |
+
state_dims = gr.State()
|
| 774 |
+
state_extra_terms = gr.State()
|
| 775 |
+
state_highlight = gr.State()
|
| 776 |
|
| 777 |
# -------- IO helpers --------
|
| 778 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
|
|
| 807 |
df: pd.DataFrame,
|
| 808 |
cluster: Optional[str],
|
| 809 |
domain: Optional[str],
|
| 810 |
+
sender: Optional[str],
|
| 811 |
+
lang_value: str,
|
| 812 |
sentiment: str,
|
| 813 |
tag_value: str,
|
| 814 |
start: str,
|
|
|
|
| 822 |
out = out[out["cluster_id"] == cid]
|
| 823 |
if domain and domain != "(any)":
|
| 824 |
out = out[out["from_domain"] == domain]
|
| 825 |
+
if sender and sender != "(any)":
|
| 826 |
+
out = out[out["from_email"] == sender]
|
| 827 |
+
if lang_value and lang_value != "(any)":
|
| 828 |
+
out = out[out["lang"] == lang_value]
|
| 829 |
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 830 |
out = out[out["sentiment"].astype(str) == sentiment]
|
| 831 |
if tag_value and tag_value != "(any)":
|
| 832 |
+
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))
|
| 833 |
+
| out["flags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
|
| 834 |
if start:
|
| 835 |
try:
|
| 836 |
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
|
|
|
| 845 |
pass
|
| 846 |
return out
|
| 847 |
|
| 848 |
+
# -------- Simple social network stats --------
|
| 849 |
+
def social_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Top-50 addresses by degree (number of unique counterparts).

    An undirected edge is added between the sender and every to/cc recipient;
    self-loops and missing addresses are skipped.

    Args:
        df: Frame with ``from_email`` (str) and ``to_emails`` (list) columns.

    Returns:
        Frame with columns ``address`` and ``degree``, sorted by degree
        descending, at most 50 rows. Empty (but correctly-columned) when no
        edges exist — the original crashed with KeyError on sort_values here.
    """
    deg = {}  # address -> set of unique counterparts

    def add_edge(a, b):
        if not a or not b or a == b:
            return
        deg.setdefault(a, set()).add(b)
        deg.setdefault(b, set()).add(a)

    for _, r in df.iterrows():
        sender = r.get("from_email") or ""
        for rcpt in (r.get("to_emails") or []):
            add_edge(sender, rcpt)

    rows = [{"address": addr, "degree": len(nbrs)} for addr, nbrs in deg.items()]
    if not rows:
        # Guard: an empty DataFrame has no "degree" column to sort on.
        return pd.DataFrame(columns=["address", "degree"])
    return pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
|
| 866 |
+
|
| 867 |
# -------- Main pipeline --------
|
| 868 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 869 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 870 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle):
|
| 871 |
if inbox_file is None:
|
| 872 |
return ("**Please upload a file.**",
|
| 873 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 874 |
+
None, None, None, None)
|
| 875 |
|
| 876 |
use_lang = not bool(skip_lang)
|
| 877 |
|
| 878 |
+
# trusted org domains
|
| 879 |
+
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 880 |
+
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 881 |
+
# extend SUSPECT_PHRASES runtime (no mutation of constant list)
|
| 882 |
+
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 883 |
+
|
| 884 |
recs = _load_json_records(inbox_file.name)
|
| 885 |
if not recs:
|
| 886 |
return ("**No valid records found.**",
|
| 887 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 888 |
+
None, None, None, None)
|
| 889 |
|
| 890 |
# Normalize
|
| 891 |
normd = []
|
|
|
|
| 896 |
df = pd.DataFrame(normd)
|
| 897 |
if df.empty:
|
| 898 |
return ("**No usable email records after normalization.**",
|
| 899 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 900 |
+
None, None, None, None)
|
| 901 |
|
| 902 |
# Deduplicate conservatively
|
| 903 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 904 |
|
| 905 |
+
# Tags + Sentiment
|
| 906 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 907 |
df = compute_sentiment_column(df)
|
| 908 |
|
| 909 |
+
# Row-level flags (off-hours, personal-mail, attachments)
|
| 910 |
+
flags = []
|
| 911 |
+
for _, row in df.iterrows():
|
| 912 |
+
f = []
|
| 913 |
+
h = _hour_of(row.get("date") or "")
|
| 914 |
+
if h is not None and (h < 6 or h > 22):
|
| 915 |
+
f.append("odd-hours")
|
| 916 |
+
fd = (row.get("from_domain") or "").lower()
|
| 917 |
+
if (fd in PERSONAL_DOMAINS) and (fd not in trusted):
|
| 918 |
+
f.append("personal-mail")
|
| 919 |
+
# attachment names of interest
|
| 920 |
+
f += _attachment_flags(row.get("attachments") or [])
|
| 921 |
+
flags.append(f)
|
| 922 |
+
df["flags"] = flags
|
| 923 |
+
|
| 924 |
# Enriched texts (adds __HAS_*__ flags)
|
| 925 |
texts = list(df.apply(enrich_text, axis=1))
|
| 926 |
|
| 927 |
+
# === Vectorization ===
|
| 928 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 929 |
count_vec = CountVectorizer(
|
| 930 |
analyzer="word",
|
|
|
|
| 951 |
d_char = X_char.shape[1]
|
| 952 |
d_full = X_full.shape[1]
|
| 953 |
|
| 954 |
+
# LSA
|
| 955 |
use_lsa = bool(use_lsa)
|
| 956 |
X_reduced = None
|
| 957 |
svd_obj = None
|
| 958 |
norm_obj = None
|
| 959 |
if use_lsa:
|
| 960 |
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 961 |
+
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
|
| 962 |
norm_obj = Normalizer(copy=False)
|
| 963 |
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 964 |
del X_reduced_tmp
|
| 965 |
gc.collect()
|
| 966 |
|
| 967 |
+
# Optional anomaly detection (on LSA space)
|
| 968 |
+
anomaly_scores = np.full((len(df),), np.nan, dtype=np.float32)
|
| 969 |
+
if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
|
| 970 |
+
try:
|
| 971 |
+
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 972 |
+
iso.fit(X_reduced)
|
| 973 |
+
# higher is less anomalous; convert to anomaly score = -score
|
| 974 |
+
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
|
| 975 |
+
except Exception:
|
| 976 |
+
pass
|
| 977 |
+
df["anomaly_score"] = anomaly_scores
|
| 978 |
+
|
| 979 |
# K selection
|
| 980 |
if bool(auto_k):
|
| 981 |
if use_lsa:
|
| 982 |
k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
|
| 983 |
else:
|
|
|
|
| 984 |
k = auto_k_rule(X_full.shape[0])
|
| 985 |
else:
|
| 986 |
k = max(10, int(k_clusters or 350))
|
| 987 |
|
| 988 |
+
# Optional seeded init (only in LSA space)
|
| 989 |
init = None
|
| 990 |
if use_lsa:
|
| 991 |
seeds = seeded_centroids_in_lsa(
|
| 992 |
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 993 |
d_word=d_word, d_full=d_full, k=k
|
| 994 |
)
|
| 995 |
+
if seeds is not None and seeds.shape[0] == k:
|
| 996 |
+
init = seeds
|
|
|
|
|
|
|
|
|
|
|
|
|
| 997 |
|
| 998 |
# KMeans clustering (use LSA space if enabled)
|
| 999 |
X_space = (X_reduced if use_lsa else X_full)
|
|
|
|
| 1006 |
)
|
| 1007 |
labels = kmeans.fit_predict(X_space)
|
| 1008 |
|
| 1009 |
+
# Merge very-similar clusters (LSA only)
|
| 1010 |
if use_lsa:
|
| 1011 |
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
|
| 1012 |
|
| 1013 |
df["cluster_id"] = labels
|
| 1014 |
|
| 1015 |
+
# Cluster names
|
| 1016 |
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 1017 |
df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1018 |
|
| 1019 |
+
# CorruptionScore (now uses trusted domains)
|
| 1020 |
+
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1021 |
|
| 1022 |
# Build search index
|
| 1023 |
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
|
|
|
| 1051 |
)
|
| 1052 |
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 1053 |
|
| 1054 |
+
sender_counts = (
|
| 1055 |
+
df.groupby("from_email").size()
|
| 1056 |
+
.reset_index(name="count")
|
| 1057 |
+
.sort_values("count", ascending=False)
|
| 1058 |
+
.head(200)
|
| 1059 |
+
)
|
| 1060 |
+
sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
|
| 1061 |
+
|
| 1062 |
+
# Languages present
|
| 1063 |
+
langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
|
| 1064 |
+
lang_choices = ["(any)"] + langs
|
| 1065 |
+
|
| 1066 |
+
# Social stats
|
| 1067 |
+
actors = social_stats(df)
|
| 1068 |
+
|
| 1069 |
+
# Off-hours & personal mail table
|
| 1070 |
+
offp = df[(df["flags"].apply(lambda xs: "odd-hours" in (xs or []))) | (df["flags"].apply(lambda xs: "personal-mail" in (xs or [])))]
|
| 1071 |
+
offhours_table = offp[["date","from_email","from_domain","subject","flags","corruption_score"]].sort_values("corruption_score", ascending=False).head(200)
|
| 1072 |
+
|
| 1073 |
+
# Default results (sorted)
|
| 1074 |
show_df = df.copy()
|
| 1075 |
if "date" in show_df.columns and show_df["date"].notna().any():
|
| 1076 |
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 1077 |
else:
|
| 1078 |
show_df["_dt"] = pd.NaT
|
| 1079 |
show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
|
| 1080 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
|
|
|
| 1081 |
out_table = show_df[cols_out].head(500)
|
| 1082 |
|
| 1083 |
vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
|
|
|
|
| 1086 |
f"**Processed {len(df):,} emails** \n"
|
| 1087 |
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
|
| 1088 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 1089 |
+
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1090 |
+
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
| 1091 |
)
|
| 1092 |
|
| 1093 |
gc.collect()
|
| 1094 |
|
| 1095 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 1096 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1097 |
+
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1098 |
+
lang_update = gr.update(choices=lang_choices, value="(any)")
|
| 1099 |
|
| 1100 |
return (
|
| 1101 |
status_md,
|
| 1102 |
cluster_counts, domain_counts,
|
| 1103 |
+
actors, offhours_table,
|
| 1104 |
out_table,
|
| 1105 |
df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 1106 |
use_lsa, bool(use_faiss),
|
| 1107 |
+
cluster_update, domain_update, sender_update, lang_update,
|
| 1108 |
svd_obj, norm_obj,
|
| 1109 |
+
(d_word, d_char),
|
| 1110 |
+
extra_terms_lower, bool(highlight_toggle)
|
| 1111 |
)
|
| 1112 |
|
| 1113 |
(run_btn.click)(
|
| 1114 |
process_file,
|
| 1115 |
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1116 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1117 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle],
|
| 1118 |
outputs=[status,
|
| 1119 |
cluster_counts_df, domain_counts_df,
|
| 1120 |
+
actors_df, offhours_df,
|
| 1121 |
results_df,
|
| 1122 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 1123 |
state_use_lsa, state_use_faiss,
|
| 1124 |
+
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 1125 |
state_svd, state_norm,
|
| 1126 |
+
state_dims,
|
| 1127 |
+
state_extra_terms, state_highlight]
|
| 1128 |
)
|
| 1129 |
|
| 1130 |
# -------- Filtering & Search --------
|
| 1131 |
+
def _sort_results(df, by, direction):
|
| 1132 |
+
if df is None or len(df)==0:
|
| 1133 |
return pd.DataFrame()
|
| 1134 |
+
tmp = df.copy()
|
| 1135 |
+
if "date" in tmp.columns:
|
|
|
|
|
|
|
| 1136 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 1137 |
+
else:
|
| 1138 |
+
tmp["_dt"] = pd.NaT
|
| 1139 |
+
by = by or "corruption_score"
|
| 1140 |
+
asc = (direction == "asc")
|
| 1141 |
+
if by == "date":
|
| 1142 |
+
tmp = tmp.sort_values(["_dt"], ascending=asc)
|
| 1143 |
+
elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
|
| 1144 |
+
tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
|
| 1145 |
+
else:
|
| 1146 |
+
# corruption_score or search_score (if present)
|
| 1147 |
+
col = by if by in tmp.columns else "corruption_score"
|
| 1148 |
+
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1149 |
+
tmp = tmp.drop(columns=["_dt"])
|
| 1150 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1151 |
+
acc = [c for c in cols_out if c in tmp.columns]
|
| 1152 |
+
return tmp[acc].head(500)
|
| 1153 |
+
|
| 1154 |
+
def refresh_results(df, cluster_choice, domain_choice, sender_choice, lang_choice, sentiment_choice, tag_choice, start, end, sort_by, sort_dir):
    """Apply every active UI filter to the processed frame, then sort for display."""
    if df is None or len(df) == 0:
        return pd.DataFrame()
    filtered = _apply_filters(
        df, cluster_choice, domain_choice, sender_choice,
        lang_choice, sentiment_choice, tag_choice, start, end,
    )
    return _sort_results(filtered, sort_by, sort_dir)
|
| 1159 |
|
| 1160 |
+
for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]:
|
| 1161 |
ctrl.change(
|
| 1162 |
refresh_results,
|
| 1163 |
+
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1164 |
outputs=[results_df]
|
| 1165 |
)
|
| 1166 |
|
| 1167 |
reset_btn.click(
|
| 1168 |
+
lambda: ["(any)", "(any)", "(any)", "(any)", "(any)", "(any)", "", "", "corruption_score", "desc"],
|
| 1169 |
inputs=[],
|
| 1170 |
+
outputs=[cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]
|
| 1171 |
).then(
|
| 1172 |
refresh_results,
|
| 1173 |
+
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1174 |
outputs=[results_df]
|
| 1175 |
)
|
| 1176 |
|
|
|
|
| 1203 |
q_full = hstack([q_word, q_char], format="csr")
|
| 1204 |
return q_full
|
| 1205 |
|
| 1206 |
+
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1207 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1208 |
return pd.DataFrame(), []
|
| 1209 |
q_terms = _tokenize_query(q)
|
|
|
|
| 1220 |
inds = indices[0]
|
| 1221 |
sims = 1.0 - distances[0]
|
| 1222 |
results = df.iloc[inds].copy()
|
| 1223 |
+
results["search_score"] = sims
|
| 1224 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1225 |
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 1226 |
inds = I[0]
|
| 1227 |
sims = D[0]
|
| 1228 |
results = df.iloc[inds].copy()
|
| 1229 |
+
results["search_score"] = sims
|
| 1230 |
else:
|
| 1231 |
return pd.DataFrame(), q_terms
|
| 1232 |
|
| 1233 |
+
# blend with corruption score lightly
|
|
|
|
| 1234 |
cs = results["corruption_score"].fillna(0.0)
|
| 1235 |
cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
|
| 1236 |
+
results["_blend"] = 0.7*results["search_score"].values + 0.3*cs.values
|
| 1237 |
+
# sort UI-selected way
|
| 1238 |
+
if sort_by == "search_score":
|
| 1239 |
+
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
| 1240 |
+
else:
|
| 1241 |
+
# use blended but keep sort_by if chosen
|
| 1242 |
+
if sort_by in results.columns:
|
| 1243 |
+
results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
|
| 1244 |
+
else:
|
| 1245 |
+
results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
|
| 1246 |
+
results = results.drop(columns=["_blend"])
|
| 1247 |
+
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
|
| 1248 |
return results[cols].head(50), q_terms
|
| 1249 |
|
| 1250 |
search_btn.click(
|
| 1251 |
search_fn,
|
| 1252 |
+
inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir],
|
| 1253 |
outputs=[results_df, state_query_terms]
|
| 1254 |
)
|
| 1255 |
|
| 1256 |
+
def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str],
|
| 1257 |
+
query_terms: Optional[List[str]], extra_terms: List[str], do_highlight: bool):
|
| 1258 |
try:
|
| 1259 |
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
| 1260 |
except Exception:
|
|
|
|
| 1279 |
row = cand.iloc[0]
|
| 1280 |
cid = int(row.get("cluster_id", -1))
|
| 1281 |
clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
|
| 1282 |
+
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
|
| 1283 |
|
| 1284 |
results_df.select(
|
| 1285 |
on_row_select,
|
| 1286 |
+
inputs=[results_df, state_df, state_term_names, state_query_terms, state_extra_terms, state_highlight],
|
| 1287 |
outputs=[email_view]
|
| 1288 |
)
|
| 1289 |
|
| 1290 |
+
# Click cluster summary to filter
|
| 1291 |
+
def on_cluster_click(evt: gr.SelectData, df_sum: pd.DataFrame):
    """When a cluster-summary row is clicked, return its label for the dropdown.

    Falls back to "(any)" whenever the event index or summary frame is unusable.
    """
    try:
        idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        idx = evt.index if hasattr(evt, "index") else None
    if idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    label = df_sum.iloc[idx]["label"]
    return label if isinstance(label, str) else "(any)"
|
| 1300 |
+
|
| 1301 |
+
cluster_counts_df.select(
|
| 1302 |
+
on_cluster_click,
|
| 1303 |
+
inputs=[cluster_counts_df],
|
| 1304 |
+
outputs=[cluster_drop]
|
| 1305 |
+
).then(
|
| 1306 |
+
refresh_results,
|
| 1307 |
+
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1308 |
+
outputs=[results_df]
|
| 1309 |
+
)
|
| 1310 |
+
|
| 1311 |
if __name__ == "__main__":
|
| 1312 |
demo.launch()
|