Spaces:

wuhp
/

testmail2

Sleeping

App Files Files Community

wuhp committed on Aug 31, 2025

Commit

8336b56

verified ·

1 Parent(s): 2ebeb60

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -99

app.py CHANGED Viewed

@@ -38,31 +38,17 @@ except Exception:
     VADER_OK = False
 # =================== Regex & Flags ===================
-# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
 TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
-# URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
 URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
-# Quote lines ("> ...")
 QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
-# Signature separator: lines after "-- " (standard)
 SIG_RE = re.compile(r"\n-- ?\n", re.M)
-# Device footers
 SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
 HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
-# Forward/quoted markers
 FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
 FWD_MSG_RE   = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
 ON_WROTE_RE  = re.compile(r'^\s*On .* wrote:$', re.M)
-# Toggle for language detection (skip for speed)
 SKIP_LANGDETECT = True
-# Corruption keyword/phrase list (you can extend freely)
 SUSPECT_PHRASES = [
     "off the books", "cover up", "kickback", "bribe", "under the table",
     "no inspection", "special fee", "friendly payment", "confidential deal",
@@ -81,18 +67,14 @@ def html_to_text(html: str) -> str:
     return soup.get_text(separator="\n")
 def strip_quotes_and_sigs(text: str) -> str:
-    """Drop quoted lines, signatures, device footers, forwarded chains."""
     if not text:
         return ""
     text = QUOTE_LINE_RE.sub("", text)
     parts = SIG_RE.split(text)
     if parts:
         text = parts[0]
     text = SENT_FROM_RE.sub("", text)
     text = HEBREW_SENT_FROM_RE.sub("", text)
     cut = None
     for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
         m = pat.search(text)
@@ -101,11 +83,9 @@ def strip_quotes_and_sigs(text: str) -> str:
             cut = idx if (cut is None or idx < cut) else cut
     if cut is not None:
         text = text[:cut]
     return text.strip()
 def parse_name_email(s: str) -> Tuple[str, str]:
-    """Split 'Name <email>' into (name, email)."""
     if not s:
         return "", ""
     m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
@@ -114,16 +94,11 @@ def parse_name_email(s: str) -> Tuple[str, str]:
     return "", s.strip()
 def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
-    """
-    Extract inline headers (From, To, CC, Date, Subject) from the text blob.
-    Returns (headers_dict, remaining_body_text).
-    """
     headers: Dict[str, str] = {}
     lines = (text or "").splitlines()
     header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
     i = 0
     saw_header = False
     while i < len(lines):
         line = lines[i].rstrip("\r")
         stripped = line.strip()
@@ -161,39 +136,30 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
                 break
             else:
                 break
     body_text = "\n".join(lines[i:]) if i < len(lines) else ""
     return headers, body_text
 # =================== Normalization & Utilities ===================
 def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
-    """Normalize a single raw record into a structured row."""
     if str(raw.get("type", "")).lower() == "meta":
         return {}
     body_text_raw = raw.get("body_text") or raw.get("text") or ""
     html_content  = raw.get("body_html") or raw.get("html") or ""
     if html_content and not body_text_raw:
         body_text_raw = html_to_text(html_content)
     body_text_raw = ftfy.fix_text(body_text_raw or "")
     subject_text = ""
     from_name = from_email = from_domain = ""
     date_val = raw.get("date") or raw.get("Date") or ""
     if body_text_raw:
         headers, body_only = parse_email_headers(body_text_raw)
         subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
         sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
         date_val = headers.get("Date", "") or date_val
-        # Clean body: NO phone redaction, per your request
         body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
         body_clean = URL_RE.sub(" URL ", body_clean)
         body_clean = re.sub(r"\s+", " ", body_clean).strip()
         body_text = body_clean
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     else:
@@ -205,9 +171,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
         sender = raw.get("from") or raw.get("From") or ""
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
     if use_langdetect:
         try:
             lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
@@ -215,7 +179,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
             lang = "unknown"
     else:
         lang = "unknown"
     iso_date = ""
     if isinstance(date_val, (int, float)):
         try:
@@ -224,15 +187,12 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
             iso_date = ""
     elif isinstance(date_val, str) and date_val:
         iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
     msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
     if not msg_id:
         msg_id = f"gen-{uuid.uuid4().hex}"
     thread_key = subject_norm or (from_email + body_text[:120])
     thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
     text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
     return {
         "message_id": str(msg_id),
         "thread_id": thread_id,
@@ -247,22 +207,20 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
     }
 def has_suspect_tag(text: str) -> List[str]:
-    """Return list of corruption/suspicion tags present in text."""
     tags = []
     if not text:
         return tags
     low = text.lower()
-    hits = []
     for phrase in SUSPECT_PHRASES:
         if phrase in low:
-            hits.append("🚩suspect")
             break
     if "invoice" in low or "payment" in low or "contract" in low:
-        hits.append("finance")
     if "wire" in low or "transfer" in low or "cash" in low:
-        if "finance" not in hits:
-            hits.append("finance")
-    return hits
 def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
     if not VADER_OK:
@@ -272,14 +230,12 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
     analyzer = SentimentIntensityAnalyzer()
     scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
     df["sentiment_score"] = scores
-    # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
     bins = [-1.01, -0.05, 0.05, 1.01]
     labels = ["negative", "neutral", "positive"]
     df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
     return df
 def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
-    """Email reader HTML with highlighted query terms and visible tags."""
     subject = (row.get("subject") or "").strip()
     body    = (row.get("body_text") or "").strip()
     from_email = row.get("from_email") or ""
@@ -303,12 +259,8 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
     subject_h = hi(subject)
     body_h    = hi(body)
-    # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
     rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
     dir_attr = ' dir="rtl"' if rtl else ""
-    # PRECOMPUTE to avoid backslashes inside f-string expressions
     body_html = body_h.replace("\n", "<br/>")
     tag_html = ""
@@ -346,7 +298,6 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
         if mask.sum() == 0:
             out[int(c)] = f"cluster_{c}"
             continue
-        # mean TF-IDF per feature inside cluster
         mean_vec = X[mask].mean(axis=0).A1
         if mean_vec.size == 0:
             out[int(c)] = f"cluster_{c}"
@@ -358,7 +309,6 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
     return out
 def auto_k_rule(n_docs: int) -> int:
-    # Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
     return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
 # =================== Gradio UI ===================
@@ -430,16 +380,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     with gr.Row():
         search_query = gr.Textbox(label="Search (keywords, names, etc.)")
         search_btn = gr.Button("Search")
-    results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True, height=360)
     email_view = gr.HTML(label="Reader")
     # State
-    state_df          = gr.State()  # full dataframe
-    state_vec         = gr.State()  # TfidfVectorizer
-    state_X_reduced   = gr.State()  # np.ndarray (LSA normalized) or None
-    state_index       = gr.State()  # Faiss index or sklearn NN
-    state_term_names  = gr.State()  # dict cluster_id -> label
-    state_query_terms = gr.State()  # last search terms list
     state_use_lsa     = gr.State()
     state_use_faiss   = gr.State()
@@ -480,7 +431,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                        start: str, end: str) -> pd.DataFrame:
         out = df
         if cluster and cluster != "(any)":
-            # cluster values like "12 — payment, contract (534)"
             m = re.match(r"^(\d+)\s+—", cluster)
             if m:
                 cid = int(m.group(1))
@@ -490,9 +440,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
             out = out[out["sentiment"].astype(str) == sentiment]
         if tag_value and tag_value != "(any)":
-            # tags is a list; check membership robustly
             out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
-        # date bounds
         if start:
             try:
                 dt = pd.to_datetime(start, utc=True, errors="coerce")
@@ -521,7 +469,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return ("**No valid records found.**",
                     None, None, None, None, None, None, None, None, None, None)
-        # Normalize
         normd = []
         for r in tqdm(recs, desc="Normalize", leave=False):
             out = normalize_email_record(r, use_langdetect=use_lang)
@@ -532,17 +479,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return ("**No usable email records after normalization.**",
                     None, None, None, None, None, None, None, None, None, None)
-        # Deduplicate conservatively
         df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
-        # Tags (suspect/finance) + Sentiment
         df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
         df = compute_sentiment_column(df)
-        # Texts for modeling
         texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
-        # TF-IDF (sparse CSR float32)
         ngram_range = (1, 2) if use_bigrams else (1, 1)
         vec = TfidfVectorizer(
             analyzer="word",
@@ -555,20 +498,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             sublinear_tf=True,
             dtype=np.float32,
         )
-        X = vec.fit_transform(texts)  # CSR float32
-        # LSA (TruncatedSVD + Normalizer) for stability/quality
         use_lsa = bool(use_lsa)
         X_reduced = None
         if use_lsa:
             svd = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
-            X_reduced_tmp = svd.fit_transform(X)  # dense (n_docs x lsa_dim)
             normalizer = Normalizer(copy=False)
             X_reduced = normalizer.fit_transform(X_reduced_tmp).astype(np.float32)
             del X_reduced_tmp
             gc.collect()
-        # KMeans clustering
         if bool(auto_k):
             k = auto_k_rule(X.shape[0])
         else:
@@ -583,32 +524,26 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         labels = kmeans.fit_predict(X_reduced if use_lsa else X)
         df["cluster_id"] = labels
-        # Name clusters by top terms (use original TF-IDF for interpretability)
         term_names = top_terms_per_cluster(X, labels, vec, topn=6)
         df["cluster_name"] = [term_names[int(c)] for c in labels]
-        # Build search index
         use_faiss = bool(use_faiss) and FAISS_OK
         index_obj = None
         if use_faiss and use_lsa:
-            # cosine ≈ inner product on normalized vectors
             d = (X_reduced.shape[1])
             index_obj = faiss.IndexFlatIP(d)
             index_obj.add(X_reduced)
         else:
-            # fallback to brute-force cosine on TF-IDF or reduced vectors
             nn = NearestNeighbors(metric="cosine", algorithm="brute")
             nn.fit(X_reduced if use_lsa else X)
             index_obj = nn
-        # Summaries
         cluster_counts = (
             df.groupby(["cluster_id", "cluster_name"]).size()
               .reset_index(name="count")
               .sort_values("count", ascending=False)
               .head(500)
         )
-        # For dropdown labels: "id — label (count)"
         cluster_counts["label"] = cluster_counts.apply(
             lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
@@ -622,10 +557,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         )
         domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
-        # Results preview default (latest 500 by date if available)
         if "date" in df.columns and df["date"].notna().any():
             show_df = df.copy()
-            # coerce to datetime for sort
             show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
             show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
         else:
@@ -641,7 +574,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             f"k = {k}  |  Search = {'Faiss (IP on LSA)' if (use_faiss and use_lsa and FAISS_OK) else 'cosine brute-force'}"
         )
-        # Free some heavy temporaries from local scope
         gc.collect()
         return (status_md,
@@ -651,7 +583,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 use_lsa, (use_faiss and use_lsa and FAISS_OK),
                 cluster_choices, domain_choices)
-    # Wire process
     (run_btn.click)(
         process_file,
         inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
@@ -670,7 +601,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return pd.DataFrame()
         filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
         cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
-        # default: sort by date desc if possible
         if "date" in filt.columns and filt["date"].notna().any():
             tmp = filt.copy()
             tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
@@ -698,29 +628,20 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     def _tokenize_query(q: str) -> List[str]:
         if not q:
             return []
-        # split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
         parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
-        # dedupe while preserving order
         seen, out = set(), []
         for p in parts:
             if p.lower() not in seen:
                 out.append(p)
                 seen.add(p.lower())
-        return out[:8]  # limit highlights for performance
     def search_fn(q, df, vec, X_reduced, index_obj, use_lsa_flag, use_faiss_flag):
         if (not q) or (df is None) or (vec is None) or (index_obj is None):
             return pd.DataFrame(), []
         q_terms = _tokenize_query(q)
-        # Vectorize the query
         q_vec = vec.transform([q])
-        if use_lsa_flag and X_reduced is not None:
-            # Ideally, project q with the same SVD+Normalizer; since we didn't persist them,
-            # we fall back to the TF-IDF brute-force path below.
-            pass
-        # If we have a sklearn NearestNeighbors (cosine brute-force)
         if isinstance(index_obj, NearestNeighbors):
             distances, indices = index_obj.kneighbors(q_vec, n_neighbors=min(50, len(df)))
             inds = indices[0]
@@ -728,8 +649,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             results = df.iloc[inds].copy()
             results["score"] = sims
         elif FAISS_OK and isinstance(index_obj, faiss.Index):
-            # Can't project the query into LSA here without the SVD/Normalizer objects;
-            # so skip ANN for ad-hoc queries and return no results (or implement a TF-IDF fallback if you keep X).
             return pd.DataFrame(), q_terms
         else:
             return pd.DataFrame(), q_terms
@@ -750,12 +670,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             row_idx = evt.index if hasattr(evt, "index") else None
         if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
             return ""
-        # Get identifying columns from the table row to map back to original df row
         sel = table.iloc[row_idx]
         subj = sel.get("subject", None)
         frm  = sel.get("from_email", None)
         dstr = sel.get("date", None)
-        # match in original df
         cand = df
         if subj is not None:
             cand = cand[cand["subject"] == subj]

     VADER_OK = False
 # =================== Regex & Flags ===================
 TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
 QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
 SIG_RE = re.compile(r"\n-- ?\n", re.M)
 SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
 HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
 FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
 FWD_MSG_RE   = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
 ON_WROTE_RE  = re.compile(r'^\s*On .* wrote:$', re.M)
 SKIP_LANGDETECT = True
 SUSPECT_PHRASES = [
     "off the books", "cover up", "kickback", "bribe", "under the table",
     "no inspection", "special fee", "friendly payment", "confidential deal",
     return soup.get_text(separator="\n")
 def strip_quotes_and_sigs(text: str) -> str:
     if not text:
         return ""
     text = QUOTE_LINE_RE.sub("", text)
     parts = SIG_RE.split(text)
     if parts:
         text = parts[0]
     text = SENT_FROM_RE.sub("", text)
     text = HEBREW_SENT_FROM_RE.sub("", text)
     cut = None
     for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
         m = pat.search(text)
             cut = idx if (cut is None or idx < cut) else cut
     if cut is not None:
         text = text[:cut]
     return text.strip()
 def parse_name_email(s: str) -> Tuple[str, str]:
     if not s:
         return "", ""
     m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
     return "", s.strip()
 def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
     headers: Dict[str, str] = {}
     lines = (text or "").splitlines()
     header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
     i = 0
     saw_header = False
     while i < len(lines):
         line = lines[i].rstrip("\r")
         stripped = line.strip()
                 break
             else:
                 break
     body_text = "\n".join(lines[i:]) if i < len(lines) else ""
     return headers, body_text
 # =================== Normalization & Utilities ===================
 def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
     if str(raw.get("type", "")).lower() == "meta":
         return {}
     body_text_raw = raw.get("body_text") or raw.get("text") or ""
     html_content  = raw.get("body_html") or raw.get("html") or ""
     if html_content and not body_text_raw:
         body_text_raw = html_to_text(html_content)
     body_text_raw = ftfy.fix_text(body_text_raw or "")
     subject_text = ""
     from_name = from_email = from_domain = ""
     date_val = raw.get("date") or raw.get("Date") or ""
     if body_text_raw:
         headers, body_only = parse_email_headers(body_text_raw)
         subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
         sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
         date_val = headers.get("Date", "") or date_val
         body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
         body_clean = URL_RE.sub(" URL ", body_clean)
         body_clean = re.sub(r"\s+", " ", body_clean).strip()
         body_text = body_clean
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     else:
         sender = raw.get("from") or raw.get("From") or ""
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
     if use_langdetect:
         try:
             lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
             lang = "unknown"
     else:
         lang = "unknown"
     iso_date = ""
     if isinstance(date_val, (int, float)):
         try:
             iso_date = ""
     elif isinstance(date_val, str) and date_val:
         iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
     msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
     if not msg_id:
         msg_id = f"gen-{uuid.uuid4().hex}"
     thread_key = subject_norm or (from_email + body_text[:120])
     thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
     text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
     return {
         "message_id": str(msg_id),
         "thread_id": thread_id,
     }
 def has_suspect_tag(text: str) -> List[str]:
     tags = []
     if not text:
         return tags
     low = text.lower()
     for phrase in SUSPECT_PHRASES:
         if phrase in low:
+            tags.append("🚩suspect")
             break
     if "invoice" in low or "payment" in low or "contract" in low:
+        tags.append("finance")
     if "wire" in low or "transfer" in low or "cash" in low:
+        if "finance" not in tags:
+            tags.append("finance")
+    return tags
 def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
     if not VADER_OK:
     analyzer = SentimentIntensityAnalyzer()
     scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
     df["sentiment_score"] = scores
     bins = [-1.01, -0.05, 0.05, 1.01]
     labels = ["negative", "neutral", "positive"]
     df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
     return df
 def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
     subject = (row.get("subject") or "").strip()
     body    = (row.get("body_text") or "").strip()
     from_email = row.get("from_email") or ""
     subject_h = hi(subject)
     body_h    = hi(body)
     rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
     dir_attr = ' dir="rtl"' if rtl else ""
     body_html = body_h.replace("\n", "<br/>")
     tag_html = ""
         if mask.sum() == 0:
             out[int(c)] = f"cluster_{c}"
             continue
         mean_vec = X[mask].mean(axis=0).A1
         if mean_vec.size == 0:
             out[int(c)] = f"cluster_{c}"
     return out
 def auto_k_rule(n_docs: int) -> int:
     """Pick a cluster count from the corpus size.

     The raw value is ``sqrt(n_docs / 50) * 110`` (with ``n_docs`` floored
     at 1), which is then clamped to the range 120..600 and truncated to
     an integer.
     """
     doc_count = max(n_docs, 1)
     scaled = math.sqrt(doc_count / 50.0) * 110
     capped = min(600, scaled)
     return int(max(120, capped))
 # =================== Gradio UI ===================
     with gr.Row():
         search_query = gr.Textbox(label="Search (keywords, names, etc.)")
         search_btn = gr.Button("Search")
+    # Removed unsupported `height` arg for older Gradio
+    results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
     email_view = gr.HTML(label="Reader")
     # State
+    state_df          = gr.State()
+    state_vec         = gr.State()
+    state_X_reduced   = gr.State()
+    state_index       = gr.State()
+    state_term_names  = gr.State()
+    state_query_terms = gr.State()
     state_use_lsa     = gr.State()
     state_use_faiss   = gr.State()
                        start: str, end: str) -> pd.DataFrame:
         out = df
         if cluster and cluster != "(any)":
             m = re.match(r"^(\d+)\s+—", cluster)
             if m:
                 cid = int(m.group(1))
         if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
             out = out[out["sentiment"].astype(str) == sentiment]
         if tag_value and tag_value != "(any)":
             out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
         if start:
             try:
                 dt = pd.to_datetime(start, utc=True, errors="coerce")
             return ("**No valid records found.**",
                     None, None, None, None, None, None, None, None, None, None)
         normd = []
         for r in tqdm(recs, desc="Normalize", leave=False):
             out = normalize_email_record(r, use_langdetect=use_lang)
             return ("**No usable email records after normalization.**",
                     None, None, None, None, None, None, None, None, None, None)
         df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
         df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
         df = compute_sentiment_column(df)
         texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
         ngram_range = (1, 2) if use_bigrams else (1, 1)
         vec = TfidfVectorizer(
             analyzer="word",
             sublinear_tf=True,
             dtype=np.float32,
         )
+        X = vec.fit_transform(texts)
         use_lsa = bool(use_lsa)
         X_reduced = None
         if use_lsa:
             svd = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
+            X_reduced_tmp = svd.fit_transform(X)
             normalizer = Normalizer(copy=False)
             X_reduced = normalizer.fit_transform(X_reduced_tmp).astype(np.float32)
             del X_reduced_tmp
             gc.collect()
         if bool(auto_k):
             k = auto_k_rule(X.shape[0])
         else:
         labels = kmeans.fit_predict(X_reduced if use_lsa else X)
         df["cluster_id"] = labels
         term_names = top_terms_per_cluster(X, labels, vec, topn=6)
         df["cluster_name"] = [term_names[int(c)] for c in labels]
         use_faiss = bool(use_faiss) and FAISS_OK
         index_obj = None
         if use_faiss and use_lsa:
             d = (X_reduced.shape[1])
             index_obj = faiss.IndexFlatIP(d)
             index_obj.add(X_reduced)
         else:
             nn = NearestNeighbors(metric="cosine", algorithm="brute")
             nn.fit(X_reduced if use_lsa else X)
             index_obj = nn
         cluster_counts = (
             df.groupby(["cluster_id", "cluster_name"]).size()
               .reset_index(name="count")
               .sort_values("count", ascending=False)
               .head(500)
         )
         cluster_counts["label"] = cluster_counts.apply(
             lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
         )
         domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
         if "date" in df.columns and df["date"].notna().any():
             show_df = df.copy()
             show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
             show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
         else:
             f"k = {k}  |  Search = {'Faiss (IP on LSA)' if (use_faiss and use_lsa and FAISS_OK) else 'cosine brute-force'}"
         )
         gc.collect()
         return (status_md,
                 use_lsa, (use_faiss and use_lsa and FAISS_OK),
                 cluster_choices, domain_choices)
     (run_btn.click)(
         process_file,
         inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
             return pd.DataFrame()
         filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
         cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
         if "date" in filt.columns and filt["date"].notna().any():
             tmp = filt.copy()
             tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
     def _tokenize_query(q: str) -> List[str]:
         if not q:
             return []
         parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
         seen, out = set(), []
         for p in parts:
             if p.lower() not in seen:
                 out.append(p)
                 seen.add(p.lower())
+        return out[:8]
     def search_fn(q, df, vec, X_reduced, index_obj, use_lsa_flag, use_faiss_flag):
         if (not q) or (df is None) or (vec is None) or (index_obj is None):
             return pd.DataFrame(), []
         q_terms = _tokenize_query(q)
         q_vec = vec.transform([q])
         if isinstance(index_obj, NearestNeighbors):
             distances, indices = index_obj.kneighbors(q_vec, n_neighbors=min(50, len(df)))
             inds = indices[0]
             results = df.iloc[inds].copy()
             results["score"] = sims
         elif FAISS_OK and isinstance(index_obj, faiss.Index):
+            # Without persisted SVD/Normalizer, we can't project q to LSA; skip ANN here.
             return pd.DataFrame(), q_terms
         else:
             return pd.DataFrame(), q_terms
             row_idx = evt.index if hasattr(evt, "index") else None
         if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
             return ""
         sel = table.iloc[row_idx]
         subj = sel.get("subject", None)
         frm  = sel.get("from_email", None)
         dstr = sel.get("date", None)
         cand = df
         if subj is not None:
             cand = cand[cand["subject"] == subj]