Update app.py
Browse files
app.py
CHANGED
|
@@ -17,11 +17,15 @@ import gradio as gr
|
|
| 17 |
from tqdm import tqdm
|
| 18 |
|
| 19 |
# sklearn (CPU-friendly)
|
| 20 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 21 |
from sklearn.cluster import MiniBatchKMeans
|
| 22 |
from sklearn.neighbors import NearestNeighbors
|
| 23 |
from sklearn.decomposition import TruncatedSVD
|
| 24 |
from sklearn.preprocessing import Normalizer
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Optional fast ANN (CPU)
|
| 27 |
try:
|
|
@@ -71,6 +75,20 @@ SUSPECT_PHRASES = [
|
|
| 71 |
"contract splitting", "grease payment", "unreported", "unrecorded",
|
| 72 |
]
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
# =================== Label cleanup helpers ===================
|
| 75 |
EN_STOP = {
|
| 76 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
|
@@ -78,7 +96,7 @@ EN_STOP = {
|
|
| 78 |
"re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
|
| 79 |
"message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
|
| 80 |
"herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
|
| 81 |
-
"ny"
|
| 82 |
}
|
| 83 |
HE_STOP = {
|
| 84 |
"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
|
|
@@ -109,7 +127,6 @@ def _is_junk_term(t: str) -> bool:
|
|
| 109 |
return False
|
| 110 |
|
| 111 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
| 112 |
-
# Keep order by descending weight in idxs
|
| 113 |
ordered = idxs[np.argsort(-mean_vec[idxs])]
|
| 114 |
cleaned = []
|
| 115 |
for i in ordered:
|
|
@@ -119,7 +136,6 @@ def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarra
|
|
| 119 |
cleaned.append(term)
|
| 120 |
if len(cleaned) >= want:
|
| 121 |
break
|
| 122 |
-
# If we filtered too hard, allow some not-too-bad tokens (but still avoid email-like)
|
| 123 |
if len(cleaned) < max(2, want//2):
|
| 124 |
for i in ordered:
|
| 125 |
term = names[i]
|
|
@@ -141,19 +157,14 @@ def html_to_text(html: str) -> str:
|
|
| 141 |
return soup.get_text(separator="\n")
|
| 142 |
|
| 143 |
def strip_quotes_and_sigs(text: str) -> str:
|
| 144 |
-
"""Drop quoted lines, signatures, device footers, forwarded chains."""
|
| 145 |
if not text:
|
| 146 |
return ""
|
| 147 |
-
# remove > quoted lines
|
| 148 |
text = QUOTE_LINE_RE.sub("", text)
|
| 149 |
-
# cut everything after signature separator
|
| 150 |
parts = SIG_RE.split(text)
|
| 151 |
if parts:
|
| 152 |
text = parts[0]
|
| 153 |
-
# remove device footers
|
| 154 |
text = SENT_FROM_RE.sub("", text)
|
| 155 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
| 156 |
-
# trim forwarded/quoted chains
|
| 157 |
cut = None
|
| 158 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
| 159 |
m = pat.search(text)
|
|
@@ -165,7 +176,6 @@ def strip_quotes_and_sigs(text: str) -> str:
|
|
| 165 |
return text.strip()
|
| 166 |
|
| 167 |
def parse_name_email(s: str) -> Tuple[str, str]:
|
| 168 |
-
"""Split 'Name <email>' into (name, email)."""
|
| 169 |
if not s:
|
| 170 |
return "", ""
|
| 171 |
m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
|
|
@@ -174,16 +184,11 @@ def parse_name_email(s: str) -> Tuple[str, str]:
|
|
| 174 |
return "", s.strip()
|
| 175 |
|
| 176 |
def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
| 177 |
-
"""
|
| 178 |
-
Extract inline headers (From, To, CC, Date, Subject) from the text blob.
|
| 179 |
-
Returns (headers_dict, remaining_body_text).
|
| 180 |
-
"""
|
| 181 |
headers: Dict[str, str] = {}
|
| 182 |
lines = (text or "").splitlines()
|
| 183 |
header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
|
| 184 |
i = 0
|
| 185 |
saw_header = False
|
| 186 |
-
|
| 187 |
while i < len(lines):
|
| 188 |
line = lines[i].rstrip("\r")
|
| 189 |
stripped = line.strip()
|
|
@@ -221,13 +226,11 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 221 |
break
|
| 222 |
else:
|
| 223 |
break
|
| 224 |
-
|
| 225 |
body_text = "\n".join(lines[i:]) if i < len(lines) else ""
|
| 226 |
return headers, body_text
|
| 227 |
|
| 228 |
# =================== Normalization & Utilities ===================
|
| 229 |
def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
|
| 230 |
-
"""Normalize a single raw record into a structured row."""
|
| 231 |
if str(raw.get("type", "")).lower() == "meta":
|
| 232 |
return {}
|
| 233 |
|
|
@@ -248,7 +251,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 248 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 249 |
date_val = headers.get("Date", "") or date_val
|
| 250 |
|
| 251 |
-
# Clean body
|
| 252 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 253 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
| 254 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
|
@@ -307,7 +309,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 307 |
}
|
| 308 |
|
| 309 |
def has_suspect_tag(text: str) -> List[str]:
|
| 310 |
-
"""Return list of corruption/suspicion tags present in text."""
|
| 311 |
tags = []
|
| 312 |
if not text:
|
| 313 |
return tags
|
|
@@ -330,7 +331,6 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 330 |
return df
|
| 331 |
analyzer = SentimentIntensityAnalyzer()
|
| 332 |
scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
|
| 333 |
-
# VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
|
| 334 |
bins = [-1.01, -0.05, 0.05, 1.01]
|
| 335 |
labels = ["negative", "neutral", "positive"]
|
| 336 |
df["sentiment_score"] = scores
|
|
@@ -338,7 +338,6 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 338 |
return df
|
| 339 |
|
| 340 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
|
| 341 |
-
"""Email reader HTML with highlighted query terms and visible tags."""
|
| 342 |
subject = (row.get("subject") or "").strip()
|
| 343 |
body = (row.get("body_text") or "").strip()
|
| 344 |
from_email = row.get("from_email") or ""
|
|
@@ -363,7 +362,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 363 |
subject_h = hi(subject)
|
| 364 |
body_h = hi(body)
|
| 365 |
|
| 366 |
-
# Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
|
| 367 |
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 368 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 369 |
body_html = body_h.replace("\n", "<br/>")
|
|
@@ -394,31 +392,185 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 394 |
)
|
| 395 |
return html
|
| 396 |
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
#
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
def auto_k_rule(n_docs: int) -> int:
|
| 419 |
# Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
|
| 420 |
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 421 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
# =================== Gradio UI ===================
|
| 423 |
CSS = """
|
| 424 |
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
|
|
@@ -439,8 +591,8 @@ hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
|
| 439 |
|
| 440 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 441 |
gr.Markdown("""
|
| 442 |
-
# Email Investigator —
|
| 443 |
-
**Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, and sentiment.
|
| 444 |
""")
|
| 445 |
|
| 446 |
with gr.Row():
|
|
@@ -448,7 +600,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 448 |
|
| 449 |
with gr.Accordion("Vectorization & Clustering", open=True):
|
| 450 |
with gr.Row():
|
| 451 |
-
max_features = gr.Number(label="
|
| 452 |
min_df = gr.Number(label="min_df (doc freq ≥)", value=2, precision=0)
|
| 453 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 454 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
|
@@ -456,11 +608,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 456 |
with gr.Row():
|
| 457 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 458 |
lsa_dim = gr.Number(label="LSA components", value=150, precision=0)
|
| 459 |
-
auto_k = gr.Checkbox(label="Auto choose k", value=True)
|
| 460 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 461 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 462 |
with gr.Row():
|
| 463 |
-
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available)", value=True)
|
| 464 |
|
| 465 |
with gr.Accordion("Filters", open=True):
|
| 466 |
with gr.Row():
|
|
@@ -495,7 +647,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 495 |
|
| 496 |
# State
|
| 497 |
state_df = gr.State() # full dataframe
|
| 498 |
-
state_vec = gr.State() #
|
| 499 |
state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
|
| 500 |
state_index = gr.State() # Faiss index or sklearn NN
|
| 501 |
state_term_names = gr.State() # dict cluster_id -> label
|
|
@@ -504,6 +656,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 504 |
state_use_faiss = gr.State()
|
| 505 |
state_svd = gr.State()
|
| 506 |
state_norm = gr.State()
|
|
|
|
| 507 |
|
| 508 |
# -------- IO helpers --------
|
| 509 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
@@ -545,7 +698,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 545 |
) -> pd.DataFrame:
|
| 546 |
out = df
|
| 547 |
if cluster and cluster != "(any)":
|
| 548 |
-
# cluster values like "12 — payment, contract (534)"
|
| 549 |
m = re.match(r"^(\d+)\s+—", cluster)
|
| 550 |
if m:
|
| 551 |
cid = int(m.group(1))
|
|
@@ -555,9 +707,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 555 |
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 556 |
out = out[out["sentiment"].astype(str) == sentiment]
|
| 557 |
if tag_value and tag_value != "(any)":
|
| 558 |
-
# tags is a list; check membership robustly
|
| 559 |
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
|
| 560 |
-
# date bounds
|
| 561 |
if start:
|
| 562 |
try:
|
| 563 |
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
|
@@ -577,14 +727,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 577 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
|
| 578 |
if inbox_file is None:
|
| 579 |
return ("**Please upload a file.**",
|
| 580 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None)
|
| 581 |
|
| 582 |
use_lang = not bool(skip_lang)
|
| 583 |
|
| 584 |
recs = _load_json_records(inbox_file.name)
|
| 585 |
if not recs:
|
| 586 |
return ("**No valid records found.**",
|
| 587 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None)
|
| 588 |
|
| 589 |
# Normalize
|
| 590 |
normd = []
|
|
@@ -595,7 +745,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 595 |
df = pd.DataFrame(normd)
|
| 596 |
if df.empty:
|
| 597 |
return ("**No usable email records after normalization.**",
|
| 598 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None)
|
| 599 |
|
| 600 |
# Deduplicate conservatively
|
| 601 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
|
@@ -604,12 +754,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 604 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 605 |
df = compute_sentiment_column(df)
|
| 606 |
|
| 607 |
-
#
|
| 608 |
-
texts = (df
|
| 609 |
|
| 610 |
-
#
|
| 611 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 612 |
-
|
| 613 |
analyzer="word",
|
| 614 |
ngram_range=ngram_range,
|
| 615 |
max_features=int(max_features) if max_features else None,
|
|
@@ -617,10 +767,22 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 617 |
max_df=float(max_df) if max_df else 0.7,
|
| 618 |
token_pattern=TOKEN_PATTERN,
|
| 619 |
lowercase=True,
|
| 620 |
-
sublinear_tf=True,
|
| 621 |
dtype=np.float32,
|
| 622 |
)
|
| 623 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
|
| 625 |
# LSA (TruncatedSVD + Normalizer) for stability/quality
|
| 626 |
use_lsa = bool(use_lsa)
|
|
@@ -629,43 +791,70 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 629 |
norm_obj = None
|
| 630 |
if use_lsa:
|
| 631 |
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 632 |
-
X_reduced_tmp = svd_obj.fit_transform(
|
| 633 |
norm_obj = Normalizer(copy=False)
|
| 634 |
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 635 |
del X_reduced_tmp
|
| 636 |
gc.collect()
|
| 637 |
|
| 638 |
-
#
|
| 639 |
if bool(auto_k):
|
| 640 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
else:
|
| 642 |
k = max(10, int(k_clusters or 350))
|
| 643 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
kmeans = MiniBatchKMeans(
|
| 645 |
n_clusters=k,
|
| 646 |
batch_size=int(mb_batch or 4096),
|
| 647 |
random_state=0,
|
| 648 |
-
n_init="auto",
|
|
|
|
| 649 |
)
|
| 650 |
-
labels = kmeans.fit_predict(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
df["cluster_id"] = labels
|
| 652 |
|
| 653 |
-
# Name clusters by
|
| 654 |
-
term_names =
|
| 655 |
-
df["cluster_name"] = [term_names
|
|
|
|
|
|
|
|
|
|
| 656 |
|
| 657 |
# Build search index
|
| 658 |
-
use_faiss = bool(use_faiss) and FAISS_OK
|
| 659 |
index_obj = None
|
| 660 |
-
if use_faiss
|
| 661 |
-
# cosine ≈ inner product on normalized vectors
|
| 662 |
d = X_reduced.shape[1]
|
| 663 |
-
index_obj = faiss.IndexFlatIP(d)
|
| 664 |
index_obj.add(X_reduced)
|
| 665 |
else:
|
| 666 |
-
# fallback to brute-force cosine on TF-IDF or reduced vectors
|
| 667 |
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 668 |
-
nn.fit(
|
| 669 |
index_obj = nn
|
| 670 |
|
| 671 |
# Summaries
|
|
@@ -675,7 +864,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 675 |
.sort_values("count", ascending=False)
|
| 676 |
.head(500)
|
| 677 |
)
|
| 678 |
-
# For dropdown labels: "id — label (count)"
|
| 679 |
cluster_counts["label"] = cluster_counts.apply(
|
| 680 |
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 681 |
)
|
|
@@ -689,28 +877,28 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 689 |
)
|
| 690 |
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 691 |
|
| 692 |
-
# Results preview default
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
# coerce to datetime for sort
|
| 696 |
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 697 |
-
show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
|
| 698 |
else:
|
| 699 |
-
show_df =
|
|
|
|
| 700 |
|
| 701 |
-
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
|
| 702 |
out_table = show_df[cols_out].head(500)
|
| 703 |
|
|
|
|
|
|
|
| 704 |
status_md = (
|
| 705 |
f"**Processed {len(df):,} emails** \n"
|
| 706 |
-
f"
|
| 707 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 708 |
-
f"k = {k} | Search = {'Faiss (IP on LSA)' if
|
| 709 |
)
|
| 710 |
|
| 711 |
gc.collect()
|
| 712 |
|
| 713 |
-
# Use gr.update to set dropdown choices + default values safely
|
| 714 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 715 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 716 |
|
|
@@ -718,10 +906,11 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 718 |
status_md,
|
| 719 |
cluster_counts, domain_counts,
|
| 720 |
out_table,
|
| 721 |
-
df,
|
| 722 |
-
use_lsa, (use_faiss
|
| 723 |
cluster_update, domain_update,
|
| 724 |
-
svd_obj, norm_obj
|
|
|
|
| 725 |
)
|
| 726 |
|
| 727 |
(run_btn.click)(
|
|
@@ -734,7 +923,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 734 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 735 |
state_use_lsa, state_use_faiss,
|
| 736 |
cluster_drop, domain_drop,
|
| 737 |
-
state_svd, state_norm
|
|
|
|
| 738 |
)
|
| 739 |
|
| 740 |
# -------- Filtering & Search --------
|
|
@@ -742,14 +932,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 742 |
if df is None or len(df) == 0:
|
| 743 |
return pd.DataFrame()
|
| 744 |
filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
|
| 745 |
-
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
|
| 746 |
-
# default: sort by date desc if possible
|
| 747 |
if "date" in filt.columns and filt["date"].notna().any():
|
| 748 |
tmp = filt.copy()
|
| 749 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 750 |
-
tmp = tmp.sort_values("_dt", ascending=False).drop(columns=["_dt"])
|
| 751 |
return tmp[cols_out].head(500)
|
| 752 |
-
return filt[cols_out].head(500)
|
| 753 |
|
| 754 |
for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
|
| 755 |
ctrl.change(
|
|
@@ -758,7 +947,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 758 |
outputs=[results_df]
|
| 759 |
)
|
| 760 |
|
| 761 |
-
# Safer reset: set dropdowns to None (always valid), others to defaults
|
| 762 |
reset_btn.click(
|
| 763 |
lambda: [None, None, "(any)", "(any)", "", ""],
|
| 764 |
inputs=[],
|
|
@@ -772,38 +960,43 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 772 |
def _tokenize_query(q: str) -> List[str]:
|
| 773 |
if not q:
|
| 774 |
return []
|
| 775 |
-
# split on spaces, keep simple tokens; dedupe while preserving order
|
| 776 |
parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
|
| 777 |
seen, out = set(), []
|
| 778 |
for p in parts:
|
| 779 |
if p.lower() not in seen:
|
| 780 |
out.append(p)
|
| 781 |
seen.add(p.lower())
|
| 782 |
-
return out[:8]
|
| 783 |
|
| 784 |
def _project_query_to_lsa(q_vec, svd_obj, norm_obj) -> Optional[np.ndarray]:
|
| 785 |
try:
|
| 786 |
-
q_red = svd_obj.transform(q_vec)
|
| 787 |
-
q_red = norm_obj.transform(q_red)
|
| 788 |
return q_red.astype(np.float32)
|
| 789 |
except Exception:
|
| 790 |
return None
|
| 791 |
|
| 792 |
-
def
|
| 793 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
return pd.DataFrame(), []
|
| 795 |
q_terms = _tokenize_query(q)
|
| 796 |
-
|
| 797 |
-
# Vectorize the query
|
| 798 |
-
q_vec = vec.transform([q])
|
| 799 |
-
|
| 800 |
-
# Decide which space the index uses and project accordingly
|
| 801 |
if use_lsa_flag and (X_reduced is not None):
|
| 802 |
-
q_emb = _project_query_to_lsa(
|
| 803 |
if q_emb is None:
|
| 804 |
return pd.DataFrame(), q_terms
|
| 805 |
else:
|
| 806 |
-
q_emb =
|
| 807 |
|
| 808 |
if isinstance(index_obj, NearestNeighbors):
|
| 809 |
distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df)))
|
|
@@ -811,7 +1004,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 811 |
sims = 1.0 - distances[0]
|
| 812 |
results = df.iloc[inds].copy()
|
| 813 |
results["score"] = sims
|
| 814 |
-
elif FAISS_OK and isinstance(index_obj, faiss.Index):
|
| 815 |
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 816 |
inds = I[0]
|
| 817 |
sims = D[0]
|
|
@@ -820,7 +1013,12 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 820 |
else:
|
| 821 |
return pd.DataFrame(), q_terms
|
| 822 |
|
| 823 |
-
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "score"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 824 |
return results[cols].head(50), q_terms
|
| 825 |
|
| 826 |
search_btn.click(
|
|
@@ -836,12 +1034,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 836 |
row_idx = evt.index if hasattr(evt, "index") else None
|
| 837 |
if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
|
| 838 |
return ""
|
| 839 |
-
# Get identifying columns from the table row to map back to original df row
|
| 840 |
sel = table.iloc[row_idx]
|
| 841 |
subj = sel.get("subject", None)
|
| 842 |
frm = sel.get("from_email", None)
|
| 843 |
dstr = sel.get("date", None)
|
| 844 |
-
# match in original df
|
| 845 |
cand = df
|
| 846 |
if subj is not None:
|
| 847 |
cand = cand[cand["subject"] == subj]
|
|
|
|
| 17 |
from tqdm import tqdm
|
| 18 |
|
| 19 |
# sklearn (CPU-friendly)
|
| 20 |
+
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer as CharTfidf
|
| 21 |
from sklearn.cluster import MiniBatchKMeans
|
| 22 |
from sklearn.neighbors import NearestNeighbors
|
| 23 |
from sklearn.decomposition import TruncatedSVD
|
| 24 |
from sklearn.preprocessing import Normalizer
|
| 25 |
+
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
+
|
| 28 |
+
from scipy.sparse import hstack
|
| 29 |
|
| 30 |
# Optional fast ANN (CPU)
|
| 31 |
try:
|
|
|
|
| 75 |
"contract splitting", "grease payment", "unreported", "unrecorded",
|
| 76 |
]
|
| 77 |
|
| 78 |
+
# Entity regexes for enrichment/scoring
|
| 79 |
+
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 80 |
+
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 81 |
+
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order)\b', re.I)
|
| 82 |
+
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 83 |
+
|
| 84 |
+
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 85 |
+
CORR_LEX = {
|
| 86 |
+
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
| 87 |
+
"invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
|
| 88 |
+
"procurement" : ["bid rigging","tender","vendor","sole source","rfp","rfq","purchase order","po"],
|
| 89 |
+
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
# =================== Label cleanup helpers ===================
|
| 93 |
EN_STOP = {
|
| 94 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
|
|
|
| 96 |
"re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
|
| 97 |
"message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
|
| 98 |
"herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
|
| 99 |
+
"ny"
|
| 100 |
}
|
| 101 |
HE_STOP = {
|
| 102 |
"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
|
|
|
|
| 127 |
return False
|
| 128 |
|
| 129 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
|
|
| 130 |
ordered = idxs[np.argsort(-mean_vec[idxs])]
|
| 131 |
cleaned = []
|
| 132 |
for i in ordered:
|
|
|
|
| 136 |
cleaned.append(term)
|
| 137 |
if len(cleaned) >= want:
|
| 138 |
break
|
|
|
|
| 139 |
if len(cleaned) < max(2, want//2):
|
| 140 |
for i in ordered:
|
| 141 |
term = names[i]
|
|
|
|
| 157 |
return soup.get_text(separator="\n")
|
| 158 |
|
| 159 |
def strip_quotes_and_sigs(text: str) -> str:
|
|
|
|
| 160 |
if not text:
|
| 161 |
return ""
|
|
|
|
| 162 |
text = QUOTE_LINE_RE.sub("", text)
|
|
|
|
| 163 |
parts = SIG_RE.split(text)
|
| 164 |
if parts:
|
| 165 |
text = parts[0]
|
|
|
|
| 166 |
text = SENT_FROM_RE.sub("", text)
|
| 167 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
|
|
|
| 168 |
cut = None
|
| 169 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
| 170 |
m = pat.search(text)
|
|
|
|
| 176 |
return text.strip()
|
| 177 |
|
| 178 |
def parse_name_email(s: str) -> Tuple[str, str]:
|
|
|
|
| 179 |
if not s:
|
| 180 |
return "", ""
|
| 181 |
m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
|
|
|
|
| 184 |
return "", s.strip()
|
| 185 |
|
| 186 |
def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
headers: Dict[str, str] = {}
|
| 188 |
lines = (text or "").splitlines()
|
| 189 |
header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
|
| 190 |
i = 0
|
| 191 |
saw_header = False
|
|
|
|
| 192 |
while i < len(lines):
|
| 193 |
line = lines[i].rstrip("\r")
|
| 194 |
stripped = line.strip()
|
|
|
|
| 226 |
break
|
| 227 |
else:
|
| 228 |
break
|
|
|
|
| 229 |
body_text = "\n".join(lines[i:]) if i < len(lines) else ""
|
| 230 |
return headers, body_text
|
| 231 |
|
| 232 |
# =================== Normalization & Utilities ===================
|
| 233 |
def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
|
|
|
|
| 234 |
if str(raw.get("type", "")).lower() == "meta":
|
| 235 |
return {}
|
| 236 |
|
|
|
|
| 251 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 252 |
date_val = headers.get("Date", "") or date_val
|
| 253 |
|
|
|
|
| 254 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 255 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
| 256 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
|
|
|
| 309 |
}
|
| 310 |
|
| 311 |
def has_suspect_tag(text: str) -> List[str]:
|
|
|
|
| 312 |
tags = []
|
| 313 |
if not text:
|
| 314 |
return tags
|
|
|
|
| 331 |
return df
|
| 332 |
analyzer = SentimentIntensityAnalyzer()
|
| 333 |
scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
|
|
|
|
| 334 |
bins = [-1.01, -0.05, 0.05, 1.01]
|
| 335 |
labels = ["negative", "neutral", "positive"]
|
| 336 |
df["sentiment_score"] = scores
|
|
|
|
| 338 |
return df
|
| 339 |
|
| 340 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
|
|
|
|
| 341 |
subject = (row.get("subject") or "").strip()
|
| 342 |
body = (row.get("body_text") or "").strip()
|
| 343 |
from_email = row.get("from_email") or ""
|
|
|
|
| 362 |
subject_h = hi(subject)
|
| 363 |
body_h = hi(body)
|
| 364 |
|
|
|
|
| 365 |
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 366 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 367 |
body_html = body_h.replace("\n", "<br/>")
|
|
|
|
| 392 |
)
|
| 393 |
return html
|
| 394 |
|
| 395 |
+
# =================== Feature engineering (BM25 + char) ===================
|
| 396 |
+
class BM25Transformer:
|
| 397 |
+
def __init__(self, k1=1.2, b=0.75):
|
| 398 |
+
self.k1 = k1
|
| 399 |
+
self.b = b
|
| 400 |
+
self.idf_ = None
|
| 401 |
+
self.avgdl_ = None
|
| 402 |
+
|
| 403 |
+
def fit(self, X):
|
| 404 |
+
# X is term-frequency (CountVectorizer)
|
| 405 |
+
N = X.shape[0]
|
| 406 |
+
# document frequency per term
|
| 407 |
+
df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
|
| 408 |
+
self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
|
| 409 |
+
dl = np.asarray(X.sum(axis=1)).ravel()
|
| 410 |
+
self.avgdl_ = float(dl.mean() if dl.size else 1.0)
|
| 411 |
+
return self
|
| 412 |
+
|
| 413 |
+
def transform(self, X):
|
| 414 |
+
X = X.tocsr(copy=True).astype(np.float32)
|
| 415 |
+
dl = np.asarray(X.sum(axis=1)).ravel()
|
| 416 |
+
k1, b, avgdl = self.k1, self.b, self.avgdl_
|
| 417 |
+
rows, cols = X.nonzero()
|
| 418 |
+
data = X.data
|
| 419 |
+
for i in range(len(data)):
|
| 420 |
+
tf = data[i]
|
| 421 |
+
d = rows[i]
|
| 422 |
+
denom = tf + k1 * (1 - b + b * (dl[d] / (avgdl + 1e-12)))
|
| 423 |
+
data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
|
| 424 |
+
return X
|
| 425 |
+
|
| 426 |
+
# Add enrichment tokens to help the model lock onto key signals
|
| 427 |
+
def enrich_text(row: pd.Series) -> str:
    """Concatenate subject and body, appending sentinel tokens for any
    detected entity patterns (money, phone, invoice/PO, company suffix)
    so the vectorizer can key on those signals as ordinary terms."""
    combined = (row.get("subject", "") or "") + "\n\n" + (row.get("body_text", "") or "")
    detectors = (
        ("__HAS_MONEY__", MONEY_RE),
        ("__HAS_PHONE__", PHONE_RE),
        ("__HAS_INVOICE__", INVOICE_RE),
        ("__HAS_COMPANY__", COMPANY_RE),
    )
    markers = [token for token, pattern in detectors if pattern.search(combined)]
    return (combined + " " + " ".join(markers)).strip()
|
| 437 |
+
|
| 438 |
+
# =================== Cluster labeling: PMI bigrams ===================
|
| 439 |
+
def cluster_labels_pmi_bigram(texts, labels, topn=6):
    """Label each cluster with its most over-represented bigrams.

    Each bigram is scored by log(P(bigram | cluster) / P(bigram)) — a
    PMI-style lift over the corpus-wide distribution — and the top `topn`
    are joined into a readable label. Bigrams are counted once per
    document (presence, not frequency). Falls back to "cluster_<id>"
    when a cluster yields no bigrams.
    """
    from collections import Counter
    import math as _math

    def _doc_bigrams(text):
        # One set per document so repeated bigrams don't dominate.
        words = re.findall(TOKEN_PATTERN, text.lower())
        return {" ".join(pair) for pair in zip(words, words[1:])}

    N = len(texts)
    corpus_counts = Counter()
    cluster_counts = {int(c): Counter() for c in np.unique(labels)}
    for text, cid in zip(texts, labels):
        doc_set = _doc_bigrams(text)
        corpus_counts.update(doc_set)
        cluster_counts[int(cid)].update(doc_set)

    result = {}
    corpus_total = sum(corpus_counts.values()) + 1e-9
    for cid in np.unique(labels):
        cid = int(cid)
        cluster_total = sum(cluster_counts[cid].values()) + 1e-9
        ranked = []
        # Only the cluster's 1000 most common bigrams are scored.
        for bg, cnt in cluster_counts[cid].most_common(1000):
            p_in_cluster = cnt / cluster_total
            p_overall = corpus_counts[bg] / corpus_total
            if p_overall > 0 and p_in_cluster > 0:
                lift = _math.log(p_in_cluster) - _math.log(p_overall)
                ranked.append((lift, bg))
        ranked.sort(reverse=True)
        top = [bg for _, bg in ranked[:topn]]
        result[cid] = ", ".join(top) if top else f"cluster_{cid}"
    return result
|
| 468 |
+
|
| 469 |
+
# =================== Auto-k (Kneedle on inertia) ===================
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
    """Pick a cluster count via the Kneedle elbow heuristic on KMeans inertia.

    Fits MiniBatchKMeans for each candidate ``k`` (on at most 40k sampled rows
    to bound runtime), normalizes the (k, inertia) curve into the unit square,
    and returns the k with the largest vertical gap below the chord that joins
    the first and last points (the "elbow").

    Returns:
        (best_k, {k: inertia}) — the chosen k plus the raw inertia curve.
    """
    Xs = X
    if X.shape[0] > 40000:
        # Deterministic subsample to keep the k-sweep affordable.
        sample_idx = np.random.RandomState(0).choice(X.shape[0], size=40000, replace=False)
        Xs = X[sample_idx]

    inertia_by_k = {}
    for k in ks:
        model = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
        model.fit(Xs)
        inertia_by_k[k] = model.inertia_

    xs = np.asarray(list(ks), dtype=float)
    ys = np.asarray([inertia_by_k[k] for k in ks], dtype=float)
    ys_n = (ys - ys.min()) / (ys.max() - ys.min() + 1e-9)
    xs_n = (xs - xs.min()) / (xs.max() - xs.min() + 1e-9)
    # Straight line (chord) from the first to the last normalized point.
    chord = ys_n[0] + (ys_n[-1] - ys_n[0]) * (xs_n - xs_n[0]) / (xs_n[-1] - xs_n[0] + 1e-9)
    best_k = int(xs[np.argmax(chord - ys_n)])
    return best_k, inertia_by_k
|
| 491 |
|
| 492 |
def auto_k_rule(n_docs: int) -> int:
    """Heuristic cluster count: grows with sqrt(n_docs), clamped to [120, 600].

    Sublinear scaling keeps the number of clusters between ~120 and 600
    even for very large corpora.
    """
    raw = math.sqrt(max(n_docs, 1) / 50.0) * 110
    if raw < 120:
        return 120
    if raw > 600:
        return 600
    return int(raw)
|
| 495 |
|
| 496 |
+
# =================== Merge close clusters (LSA space only to save RAM) ===================
def merge_close_clusters(labels, centers, thresh=0.92):
    """Union clusters whose centroid cosine similarity >= ``thresh``.

    Centroids are L2-normalized, pairwise cosine similarity is computed, and a
    union-find pass merges near-duplicate clusters. Labels are then renumbered
    densely (0..m-1) in order of first appearance.

    Returns:
        np.ndarray of int — the remapped label for every input document.
    """
    centers = sk_normalize(centers)
    sim = cosine_similarity(centers, centers)
    n_clusters = centers.shape[0]
    parent = list(range(n_clusters))

    def find(node):
        # Walk up to the representative (no path compression; k is small).
        while parent[node] != node:
            node = parent[node]
        return node

    for a in range(n_clusters):
        for b in range(a + 1, n_clusters):
            if sim[a, b] >= thresh:
                ra, rb = find(a), find(b)
                if ra != rb:
                    parent[rb] = ra

    root = {i: find(i) for i in range(n_clusters)}
    idmap = {}
    next_id = 0
    for i in range(n_clusters):
        rep = root[i]
        if rep not in idmap:
            idmap[rep] = next_id
            next_id += 1
    return np.array([idmap[root[int(c)]] for c in labels], dtype=int)
|
| 519 |
+
|
| 520 |
+
# =================== Seeded centroids (only if LSA enabled) ===================
def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
                            lsa_components: np.ndarray, norm_obj: Normalizer,
                            d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
    """Build lexicon-based seed centroids and project them into LSA space.

    Each lexicon becomes a unit vector over the word vocabulary (1.0 at every
    in-vocabulary term), is zero-padded over the char-gram dimensions, then
    projected with the fitted SVD components and L2-normalized — matching how
    the corpus itself was reduced.

    Returns:
        (n_seeds, lsa_dim) float32 array when 2 <= n_seeds <= k, else None
        (fewer than two seeds is not a meaningful initialization).
    """
    vocab = count_vec.vocabulary_
    word_seeds = []
    for words in lexicons.values():
        cols = [vocab[w.lower()] for w in words if w.lower() in vocab]
        if not cols:
            continue
        vec = np.zeros(d_word, dtype=np.float32)
        vec[cols] = 1.0
        norm = np.linalg.norm(vec)
        if norm > 0:
            vec /= norm
        word_seeds.append(vec)
    if not word_seeds:
        return None
    # Lift to the full (word + char) feature space by zero-padding char dims.
    full = np.zeros((len(word_seeds), d_full), dtype=np.float32)
    full[:, :d_word] = np.stack(word_seeds, axis=0)
    # Project to LSA space (x @ components.T), then normalize like the corpus.
    reduced = norm_obj.transform((full @ lsa_components.T).astype(np.float32))
    # MiniBatchKMeans can only use an explicit init of at most k centers.
    return reduced if 2 <= reduced.shape[0] <= k else None
|
| 554 |
+
|
| 555 |
+
# =================== Corruption scoring ===================
def corruption_score(row):
    """Heuristic risk score for one email row (higher = more suspicious).

    Additive signals:
      +2.0  any suspect phrase from SUSPECT_PHRASES in subject/body (counted once)
      +1.5  pre-assigned "🚩suspect" or "finance" tag
      +0.7  money-amount pattern present
      +0.7  invoice/PO pattern present
      +0.3  negative sentiment
      +0.5  very short body (<160 chars) that still contains a phone number

    Args:
        row: mapping-like record (pd.Series or dict) with subject/body/tags/
             sentiment fields; missing fields are treated as empty.

    Returns:
        float score.
    """
    score = 0.0
    txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
    for ph in SUSPECT_PHRASES:
        if ph in txt:
            score += 2.0
            break  # phrase evidence counts once, not per matching phrase
    if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
        score += 1.5
    if MONEY_RE.search(txt): score += 0.7
    if INVOICE_RE.search(txt): score += 0.7
    if str(row.get("sentiment","")) == "negative":
        score += 0.3
    # Guard None like every other access here: len(None) would raise TypeError.
    body = row.get("body_text","") or ""
    if len(body) < 160 and PHONE_RE.search(body):
        score += 0.5
    return score
|
| 573 |
+
|
| 574 |
# =================== Gradio UI ===================
|
| 575 |
CSS = """
|
| 576 |
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
|
|
|
|
| 591 |
|
| 592 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 593 |
gr.Markdown("""
|
| 594 |
+
# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans
|
| 595 |
+
**Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, corruption score, and sentiment.
|
| 596 |
""")
|
| 597 |
|
| 598 |
with gr.Row():
|
|
|
|
| 600 |
|
| 601 |
with gr.Accordion("Vectorization & Clustering", open=True):
|
| 602 |
with gr.Row():
|
| 603 |
+
max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
|
| 604 |
min_df = gr.Number(label="min_df (doc freq ≥)", value=2, precision=0)
|
| 605 |
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 606 |
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
|
|
|
| 608 |
with gr.Row():
|
| 609 |
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 610 |
lsa_dim = gr.Number(label="LSA components", value=150, precision=0)
|
| 611 |
+
auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
|
| 612 |
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 613 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 614 |
with gr.Row():
|
| 615 |
+
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 616 |
|
| 617 |
with gr.Accordion("Filters", open=True):
|
| 618 |
with gr.Row():
|
|
|
|
| 647 |
|
| 648 |
# State
|
| 649 |
state_df = gr.State() # full dataframe
|
| 650 |
+
state_vec = gr.State() # {"count_vec":..., "char_vec":..., "bm25":...}
|
| 651 |
state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
|
| 652 |
state_index = gr.State() # Faiss index or sklearn NN
|
| 653 |
state_term_names = gr.State() # dict cluster_id -> label
|
|
|
|
| 656 |
state_use_faiss = gr.State()
|
| 657 |
state_svd = gr.State()
|
| 658 |
state_norm = gr.State()
|
| 659 |
+
state_dims = gr.State() # (d_word, d_char)
|
| 660 |
|
| 661 |
# -------- IO helpers --------
|
| 662 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
|
|
| 698 |
) -> pd.DataFrame:
|
| 699 |
out = df
|
| 700 |
if cluster and cluster != "(any)":
|
|
|
|
| 701 |
m = re.match(r"^(\d+)\s+—", cluster)
|
| 702 |
if m:
|
| 703 |
cid = int(m.group(1))
|
|
|
|
| 707 |
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 708 |
out = out[out["sentiment"].astype(str) == sentiment]
|
| 709 |
if tag_value and tag_value != "(any)":
|
|
|
|
| 710 |
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
|
|
|
|
| 711 |
if start:
|
| 712 |
try:
|
| 713 |
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
|
|
|
| 727 |
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
|
| 728 |
if inbox_file is None:
|
| 729 |
return ("**Please upload a file.**",
|
| 730 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
|
| 731 |
|
| 732 |
use_lang = not bool(skip_lang)
|
| 733 |
|
| 734 |
recs = _load_json_records(inbox_file.name)
|
| 735 |
if not recs:
|
| 736 |
return ("**No valid records found.**",
|
| 737 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
|
| 738 |
|
| 739 |
# Normalize
|
| 740 |
normd = []
|
|
|
|
| 745 |
df = pd.DataFrame(normd)
|
| 746 |
if df.empty:
|
| 747 |
return ("**No usable email records after normalization.**",
|
| 748 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
|
| 749 |
|
| 750 |
# Deduplicate conservatively
|
| 751 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
|
|
|
| 754 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 755 |
df = compute_sentiment_column(df)
|
| 756 |
|
| 757 |
+
# Enriched texts (adds __HAS_*__ flags)
|
| 758 |
+
texts = list(df.apply(enrich_text, axis=1))
|
| 759 |
|
| 760 |
+
# === Vectorization: BM25 word + char tf-idf, then optional LSA ===
|
| 761 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 762 |
+
count_vec = CountVectorizer(
|
| 763 |
analyzer="word",
|
| 764 |
ngram_range=ngram_range,
|
| 765 |
max_features=int(max_features) if max_features else None,
|
|
|
|
| 767 |
max_df=float(max_df) if max_df else 0.7,
|
| 768 |
token_pattern=TOKEN_PATTERN,
|
| 769 |
lowercase=True,
|
|
|
|
| 770 |
dtype=np.float32,
|
| 771 |
)
|
| 772 |
+
TF = count_vec.fit_transform(texts)
|
| 773 |
+
bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
|
| 774 |
+
X_word = bm25.transform(TF) # sparse BM25 word matrix
|
| 775 |
+
|
| 776 |
+
char_vec = CharTfidf(
|
| 777 |
+
analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
|
| 778 |
+
lowercase=True, dtype=np.float32
|
| 779 |
+
)
|
| 780 |
+
X_char = char_vec.fit_transform(texts)
|
| 781 |
+
|
| 782 |
+
X_full = hstack([X_word, X_char], format="csr")
|
| 783 |
+
d_word = X_word.shape[1]
|
| 784 |
+
d_char = X_char.shape[1]
|
| 785 |
+
d_full = X_full.shape[1]
|
| 786 |
|
| 787 |
# LSA (TruncatedSVD + Normalizer) for stability/quality
|
| 788 |
use_lsa = bool(use_lsa)
|
|
|
|
| 791 |
norm_obj = None
|
| 792 |
if use_lsa:
|
| 793 |
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 794 |
+
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense (n_docs x lsa_dim)
|
| 795 |
norm_obj = Normalizer(copy=False)
|
| 796 |
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 797 |
del X_reduced_tmp
|
| 798 |
gc.collect()
|
| 799 |
|
| 800 |
+
# K selection
|
| 801 |
if bool(auto_k):
|
| 802 |
+
if use_lsa:
|
| 803 |
+
k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
|
| 804 |
+
else:
|
| 805 |
+
# fallback: heuristic rule on doc count
|
| 806 |
+
k = auto_k_rule(X_full.shape[0])
|
| 807 |
else:
|
| 808 |
k = max(10, int(k_clusters or 350))
|
| 809 |
|
| 810 |
+
# Optional seeded init (only in LSA space to keep memory sane)
|
| 811 |
+
init = None
|
| 812 |
+
if use_lsa:
|
| 813 |
+
seeds = seeded_centroids_in_lsa(
|
| 814 |
+
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 815 |
+
d_word=d_word, d_full=d_full, k=k
|
| 816 |
+
)
|
| 817 |
+
if seeds is not None and seeds.shape[0] <= k:
|
| 818 |
+
# If fewer seeds than k, KMeans will handle by k-means++ for remaining centers internally only for KMeans.
|
| 819 |
+
# For MiniBatchKMeans, we must provide exactly k centers or fall back to k-means++.
|
| 820 |
+
# So use seeds only if seeds.shape[0] == k; otherwise None.
|
| 821 |
+
if seeds.shape[0] == k:
|
| 822 |
+
init = seeds
|
| 823 |
+
|
| 824 |
+
# KMeans clustering (use LSA space if enabled)
|
| 825 |
+
X_space = (X_reduced if use_lsa else X_full)
|
| 826 |
kmeans = MiniBatchKMeans(
|
| 827 |
n_clusters=k,
|
| 828 |
batch_size=int(mb_batch or 4096),
|
| 829 |
random_state=0,
|
| 830 |
+
n_init="auto" if init is None else 1,
|
| 831 |
+
init="k-means++" if init is None else init
|
| 832 |
)
|
| 833 |
+
labels = kmeans.fit_predict(X_space)
|
| 834 |
+
|
| 835 |
+
# Optional: merge very-similar clusters (only when LSA enabled)
|
| 836 |
+
if use_lsa:
|
| 837 |
+
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
|
| 838 |
+
|
| 839 |
df["cluster_id"] = labels
|
| 840 |
|
| 841 |
+
# Name clusters by PMI bigrams on raw enriched texts
|
| 842 |
+
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 843 |
+
df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 844 |
+
|
| 845 |
+
# CorruptionScore
|
| 846 |
+
df["corruption_score"] = df.apply(corruption_score, axis=1)
|
| 847 |
|
| 848 |
# Build search index
|
| 849 |
+
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
| 850 |
index_obj = None
|
| 851 |
+
if use_faiss:
|
|
|
|
| 852 |
d = X_reduced.shape[1]
|
| 853 |
+
index_obj = faiss.IndexFlatIP(d) # cosine ~ inner product on normalized vectors
|
| 854 |
index_obj.add(X_reduced)
|
| 855 |
else:
|
|
|
|
| 856 |
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 857 |
+
nn.fit(X_space)
|
| 858 |
index_obj = nn
|
| 859 |
|
| 860 |
# Summaries
|
|
|
|
| 864 |
.sort_values("count", ascending=False)
|
| 865 |
.head(500)
|
| 866 |
)
|
|
|
|
| 867 |
cluster_counts["label"] = cluster_counts.apply(
|
| 868 |
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 869 |
)
|
|
|
|
| 877 |
)
|
| 878 |
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 879 |
|
| 880 |
+
# Results preview default: rank by corruption_score then date desc
|
| 881 |
+
show_df = df.copy()
|
| 882 |
+
if "date" in show_df.columns and show_df["date"].notna().any():
|
|
|
|
| 883 |
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
|
|
|
| 884 |
else:
|
| 885 |
+
show_df["_dt"] = pd.NaT
|
| 886 |
+
show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
|
| 887 |
|
| 888 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score"]
|
| 889 |
out_table = show_df[cols_out].head(500)
|
| 890 |
|
| 891 |
+
vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
|
| 892 |
+
|
| 893 |
status_md = (
|
| 894 |
f"**Processed {len(df):,} emails** \n"
|
| 895 |
+
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
|
| 896 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 897 |
+
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}"
|
| 898 |
)
|
| 899 |
|
| 900 |
gc.collect()
|
| 901 |
|
|
|
|
| 902 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 903 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 904 |
|
|
|
|
| 906 |
status_md,
|
| 907 |
cluster_counts, domain_counts,
|
| 908 |
out_table,
|
| 909 |
+
df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 910 |
+
use_lsa, bool(use_faiss),
|
| 911 |
cluster_update, domain_update,
|
| 912 |
+
svd_obj, norm_obj,
|
| 913 |
+
(d_word, d_char)
|
| 914 |
)
|
| 915 |
|
| 916 |
(run_btn.click)(
|
|
|
|
| 923 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 924 |
state_use_lsa, state_use_faiss,
|
| 925 |
cluster_drop, domain_drop,
|
| 926 |
+
state_svd, state_norm,
|
| 927 |
+
state_dims]
|
| 928 |
)
|
| 929 |
|
| 930 |
# -------- Filtering & Search --------
|
|
|
|
| 932 |
if df is None or len(df) == 0:
|
| 933 |
return pd.DataFrame()
|
| 934 |
filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
|
| 935 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score"]
|
|
|
|
| 936 |
if "date" in filt.columns and filt["date"].notna().any():
|
| 937 |
tmp = filt.copy()
|
| 938 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 939 |
+
tmp = tmp.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
|
| 940 |
return tmp[cols_out].head(500)
|
| 941 |
+
return filt.sort_values(["corruption_score"], ascending=False)[cols_out].head(500)
|
| 942 |
|
| 943 |
for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
|
| 944 |
ctrl.change(
|
|
|
|
| 947 |
outputs=[results_df]
|
| 948 |
)
|
| 949 |
|
|
|
|
| 950 |
reset_btn.click(
|
| 951 |
lambda: [None, None, "(any)", "(any)", "", ""],
|
| 952 |
inputs=[],
|
|
|
|
| 960 |
def _tokenize_query(q: str) -> List[str]:
|
| 961 |
if not q:
|
| 962 |
return []
|
|
|
|
| 963 |
parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
|
| 964 |
seen, out = set(), []
|
| 965 |
for p in parts:
|
| 966 |
if p.lower() not in seen:
|
| 967 |
out.append(p)
|
| 968 |
seen.add(p.lower())
|
| 969 |
+
return out[:8]
|
| 970 |
|
| 971 |
def _project_query_to_lsa(q_vec, svd_obj, norm_obj) -> Optional[np.ndarray]:
|
| 972 |
try:
|
| 973 |
+
q_red = svd_obj.transform(q_vec)
|
| 974 |
+
q_red = norm_obj.transform(q_red)
|
| 975 |
return q_red.astype(np.float32)
|
| 976 |
except Exception:
|
| 977 |
return None
|
| 978 |
|
| 979 |
+
def _vectorize_query(q: str, vec_state: Dict[str, Any]):
|
| 980 |
+
count_vec = vec_state["count_vec"]
|
| 981 |
+
char_vec = vec_state["char_vec"]
|
| 982 |
+
bm25 = vec_state["bm25"]
|
| 983 |
+
q_word_tf = count_vec.transform([q])
|
| 984 |
+
q_word = bm25.transform(q_word_tf)
|
| 985 |
+
q_char = char_vec.transform([q])
|
| 986 |
+
q_full = hstack([q_word, q_char], format="csr")
|
| 987 |
+
return q_full
|
| 988 |
+
|
| 989 |
+
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj):
|
| 990 |
+
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 991 |
return pd.DataFrame(), []
|
| 992 |
q_terms = _tokenize_query(q)
|
| 993 |
+
q_vec_full = _vectorize_query(q, vec_state)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 994 |
if use_lsa_flag and (X_reduced is not None):
|
| 995 |
+
q_emb = _project_query_to_lsa(q_vec_full, svd_obj, norm_obj)
|
| 996 |
if q_emb is None:
|
| 997 |
return pd.DataFrame(), q_terms
|
| 998 |
else:
|
| 999 |
+
q_emb = q_vec_full
|
| 1000 |
|
| 1001 |
if isinstance(index_obj, NearestNeighbors):
|
| 1002 |
distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df)))
|
|
|
|
| 1004 |
sims = 1.0 - distances[0]
|
| 1005 |
results = df.iloc[inds].copy()
|
| 1006 |
results["score"] = sims
|
| 1007 |
+
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1008 |
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 1009 |
inds = I[0]
|
| 1010 |
sims = D[0]
|
|
|
|
| 1013 |
else:
|
| 1014 |
return pd.DataFrame(), q_terms
|
| 1015 |
|
| 1016 |
+
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score", "score"]
|
| 1017 |
+
# Rerank by a blend: 0.7 * ANN score + 0.3 * corruption_score (scaled)
|
| 1018 |
+
cs = results["corruption_score"].fillna(0.0)
|
| 1019 |
+
cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
|
| 1020 |
+
results["_blend"] = 0.7*results["score"].values + 0.3*cs.values
|
| 1021 |
+
results = results.sort_values("_blend", ascending=False).drop(columns=["_blend"])
|
| 1022 |
return results[cols].head(50), q_terms
|
| 1023 |
|
| 1024 |
search_btn.click(
|
|
|
|
| 1034 |
row_idx = evt.index if hasattr(evt, "index") else None
|
| 1035 |
if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
|
| 1036 |
return ""
|
|
|
|
| 1037 |
sel = table.iloc[row_idx]
|
| 1038 |
subj = sel.get("subject", None)
|
| 1039 |
frm = sel.get("from_email", None)
|
| 1040 |
dstr = sel.get("date", None)
|
|
|
|
| 1041 |
cand = df
|
| 1042 |
if subj is not None:
|
| 1043 |
cand = cand[cand["subject"] == subj]
|