Spaces:

wuhp
/

testmail2

Sleeping

App Files Files Community

wuhp committed on Aug 31, 2025

Commit

c8241c4

verified ·

1 Parent(s): c8dbc29

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -52

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ from sklearn.preprocessing import Normalizer
 # Optional fast ANN (CPU)
 try:
-    import faiss                   # faiss-cpu on HF Space
     FAISS_OK = True
 except Exception:
     FAISS_OK = False
@@ -38,17 +38,31 @@ except Exception:
     VADER_OK = False
 # =================== Regex & Flags ===================
 TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
 QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
 SIG_RE = re.compile(r"\n-- ?\n", re.M)
 SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
 HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
 FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
-FWD_MSG_RE   = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
-ON_WROTE_RE  = re.compile(r'^\s*On .* wrote:$', re.M)
 SKIP_LANGDETECT = True
 SUSPECT_PHRASES = [
     "off the books", "cover up", "kickback", "bribe", "under the table",
     "no inspection", "special fee", "friendly payment", "confidential deal",
@@ -67,14 +81,22 @@ def html_to_text(html: str) -> str:
     return soup.get_text(separator="\n")
 def strip_quotes_and_sigs(text: str) -> str:
     if not text:
         return ""
     text = QUOTE_LINE_RE.sub("", text)
     parts = SIG_RE.split(text)
     if parts:
         text = parts[0]
-    text = SENTO_FROM_RE.sub("", text) if (SENT_FROM_RE := SENT_FROM_RE) else text  # keep name stable
     text = HEBREW_SENT_FROM_RE.sub("", text)
     cut = None
     for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
         m = pat.search(text)
@@ -83,9 +105,11 @@ def strip_quotes_and_sigs(text: str) -> str:
             cut = idx if (cut is None or idx < cut) else cut
     if cut is not None:
         text = text[:cut]
     return text.strip()
 def parse_name_email(s: str) -> Tuple[str, str]:
     if not s:
         return "", ""
     m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
@@ -94,11 +118,16 @@ def parse_name_email(s: str) -> Tuple[str, str]:
     return "", s.strip()
 def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
     headers: Dict[str, str] = {}
     lines = (text or "").splitlines()
     header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
     i = 0
     saw_header = False
     while i < len(lines):
         line = lines[i].rstrip("\r")
         stripped = line.strip()
@@ -136,30 +165,39 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
                 break
             else:
                 break
     body_text = "\n".join(lines[i:]) if i < len(lines) else ""
     return headers, body_text
 # =================== Normalization & Utilities ===================
 def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
     if str(raw.get("type", "")).lower() == "meta":
         return {}
     body_text_raw = raw.get("body_text") or raw.get("text") or ""
-    html_content  = raw.get("body_html") or raw.get("html") or ""
     if html_content and not body_text_raw:
         body_text_raw = html_to_text(html_content)
     body_text_raw = ftfy.fix_text(body_text_raw or "")
     subject_text = ""
     from_name = from_email = from_domain = ""
     date_val = raw.get("date") or raw.get("Date") or ""
     if body_text_raw:
         headers, body_only = parse_email_headers(body_text_raw)
         subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
         sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
         date_val = headers.get("Date", "") or date_val
         body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
         body_clean = URL_RE.sub(" URL ", body_clean)
         body_clean = re.sub(r"\s+", " ", body_clean).strip()
         body_text = body_clean
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     else:
@@ -171,7 +209,9 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
         sender = raw.get("from") or raw.get("From") or ""
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
     if use_langdetect:
         try:
             lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
@@ -179,6 +219,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
             lang = "unknown"
     else:
         lang = "unknown"
     iso_date = ""
     if isinstance(date_val, (int, float)):
         try:
@@ -187,12 +228,15 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
             iso_date = ""
     elif isinstance(date_val, str) and date_val:
         iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
     msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
     if not msg_id:
         msg_id = f"gen-{uuid.uuid4().hex}"
     thread_key = subject_norm or (from_email + body_text[:120])
     thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
     text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
     return {
         "message_id": str(msg_id),
         "thread_id": thread_id,
@@ -207,6 +251,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
     }
 def has_suspect_tag(text: str) -> List[str]:
     tags = []
     if not text:
         return tags
@@ -230,17 +275,19 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
     analyzer = SentimentIntensityAnalyzer()
     scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
     df["sentiment_score"] = scores
     bins = [-1.01, -0.05, 0.05, 1.01]
     labels = ["negative", "neutral", "positive"]
     df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
     return df
-def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
     subject = (row.get("subject") or "").strip()
-    body    = (row.get("body_text") or "").strip()
     from_email = row.get("from_email") or ""
-    date    = row.get("date") or ""
-    tags    = row.get("tags") or []
     sentiment = row.get("sentiment") or "(unknown)"
     def hi(text: str) -> str:
@@ -258,9 +305,12 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
         return out
     subject_h = hi(subject)
-    body_h    = hi(body)
     rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
     dir_attr = ' dir="rtl"' if rtl else ""
     body_html = body_h.replace("\n", "<br/>")
     tag_html = ""
@@ -294,10 +344,11 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
     out = {}
     uniq = np.unique(labels)
     for c in uniq:
-        mask = (labels == c)
         if mask.sum() == 0:
             out[int(c)] = f"cluster_{c}"
             continue
         mean_vec = X[mask].mean(axis=0).A1
         if mean_vec.size == 0:
             out[int(c)] = f"cluster_{c}"
@@ -309,6 +360,7 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
     return out
 def auto_k_rule(n_docs: int) -> int:
     """Choose a cluster count from the number of documents.

     The value grows with the square root of the corpus size
     (110 * sqrt(n_docs / 50)) and is clamped to the range [120, 600].
     Document counts below 1 are treated as 1.
     """
     k_floor, k_ceiling = 120, 600
     doc_count = max(n_docs, 1)
     scaled = math.sqrt(doc_count / 50.0) * 110
     capped = min(k_ceiling, scaled)
     return int(max(k_floor, capped))
 # =================== Gradio UI ===================
@@ -365,7 +417,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             )
         with gr.Row():
             date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
-            date_end   = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
     with gr.Row():
         run_btn = gr.Button("Process", variant="primary")
@@ -374,7 +426,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     with gr.Row():
         cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
-        domain_counts_df  = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
     gr.Markdown("### Search")
     with gr.Row():
@@ -384,16 +436,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     email_view = gr.HTML(label="Reader")
     # State
-    state_df          = gr.State()
-    state_vec         = gr.State()
-    state_X_reduced   = gr.State()
-    state_index       = gr.State()
-    state_term_names  = gr.State()
-    state_query_terms = gr.State()
-    state_use_lsa     = gr.State()
-    state_use_faiss   = gr.State()
-    state_svd         = gr.State()
-    state_norm        = gr.State()
     # -------- IO helpers --------
     def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
@@ -424,14 +476,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                         recs = [obj]
         return recs
-    def _apply_filters(df: pd.DataFrame,
-                       cluster: Optional[str],
-                       domain: Optional[str],
-                       sentiment: str,
-                       tag_value: str,
-                       start: str, end: str) -> pd.DataFrame:
         out = df
         if cluster and cluster != "(any)":
             m = re.match(r"^(\d+)\s+—", cluster)
             if m:
                 cid = int(m.group(1))
@@ -441,7 +497,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
             out = out[out["sentiment"].astype(str) == sentiment]
         if tag_value and tag_value != "(any)":
             out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
         if start:
             try:
                 dt = pd.to_datetime(start, utc=True, errors="coerce")
@@ -461,15 +519,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                      use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
         if inbox_file is None:
             return ("**Please upload a file.**",
-                    None, None, None, None, None, None, None, None, None, None, None, None)
         use_lang = not bool(skip_lang)
         recs = _load_json_records(inbox_file.name)
         if not recs:
             return ("**No valid records found.**",
-                    None, None, None, None, None, None, None, None, None, None, None, None)
         normd = []
         for r in tqdm(recs, desc="Normalize", leave=False):
             out = normalize_email_record(r, use_langdetect=use_lang)
@@ -478,15 +537,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         df = pd.DataFrame(normd)
         if df.empty:
             return ("**No usable email records after normalization.**",
-                    None, None, None, None, None, None, None, None, None, None, None, None)
         df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
         df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
         df = compute_sentiment_column(df)
         texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
         ngram_range = (1, 2) if use_bigrams else (1, 1)
         vec = TfidfVectorizer(
             analyzer="word",
@@ -499,20 +562,22 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             sublinear_tf=True,
             dtype=np.float32,
         )
-        X = vec.fit_transform(texts)
         use_lsa = bool(use_lsa)
         X_reduced = None
         svd_obj = None
         norm_obj = None
         if use_lsa:
             svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
-            X_reduced_tmp = svd_obj.fit_transform(X)
             norm_obj = Normalizer(copy=False)
             X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
             del X_reduced_tmp
             gc.collect()
         if bool(auto_k):
             k = auto_k_rule(X.shape[0])
         else:
@@ -527,26 +592,32 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         labels = kmeans.fit_predict(X_reduced if use_lsa else X)
         df["cluster_id"] = labels
         term_names = top_terms_per_cluster(X, labels, vec, topn=6)
         df["cluster_name"] = [term_names[int(c)] for c in labels]
         use_faiss = bool(use_faiss) and FAISS_OK
         index_obj = None
         if use_faiss and use_lsa:
-            d = (X_reduced.shape[1])
             index_obj = faiss.IndexFlatIP(d)
             index_obj.add(X_reduced)
         else:
             nn = NearestNeighbors(metric="cosine", algorithm="brute")
             nn.fit(X_reduced if use_lsa else X)
             index_obj = nn
         cluster_counts = (
             df.groupby(["cluster_id", "cluster_name"]).size()
               .reset_index(name="count")
               .sort_values("count", ascending=False)
               .head(500)
         )
         cluster_counts["label"] = cluster_counts.apply(
             lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
@@ -560,8 +631,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         )
         domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
         if "date" in df.columns and df["date"].notna().any():
             show_df = df.copy()
             show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
             show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
         else:
@@ -579,17 +652,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         gc.collect()
-        # IMPORTANT: use gr.update to set dropdown choices + default values
         cluster_update = gr.update(choices=cluster_choices, value="(any)")
-        domain_update  = gr.update(choices=domain_choices,  value="(any)")
-        return (status_md,
-                cluster_counts, domain_counts,
-                out_table,
-                df, vec, (X_reduced if use_lsa else None), index_obj, term_names,
-                use_lsa, (use_faiss and use_lsa and FAISS_OK),
-                cluster_update, domain_update,
-                svd_obj, norm_obj)
     (run_btn.click)(
         process_file,
@@ -610,6 +685,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return pd.DataFrame()
         filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
         cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
         if "date" in filt.columns and filt["date"].notna().any():
             tmp = filt.copy()
             tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
@@ -624,8 +700,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             outputs=[results_df]
         )
     reset_btn.click(
-        lambda: ["(any)", "(any)", "(any)", "(any)", "", ""],
         inputs=[],
         outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
     ).then(
@@ -637,18 +714,20 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     def _tokenize_query(q: str) -> List[str]:
         if not q:
             return []
         parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
         seen, out = set(), []
         for p in parts:
             if p.lower() not in seen:
                 out.append(p)
                 seen.add(p.lower())
-        return out[:8]
     def _project_query_to_lsa(q_vec, svd_obj, norm_obj) -> Optional[np.ndarray]:
         try:
-            q_red = svd_obj.transform(q_vec)              # (1, lsa_dim)
-            q_red = norm_obj.transform(q_red)             # normalize
             return q_red.astype(np.float32)
         except Exception:
             return None
@@ -657,6 +736,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if (not q) or (df is None) or (vec is None) or (index_obj is None):
             return pd.DataFrame(), []
         q_terms = _tokenize_query(q)
         q_vec = vec.transform([q])
         # Decide which space the index uses and project accordingly
@@ -691,17 +772,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         outputs=[results_df, state_query_terms]
     )
-    def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int,str], query_terms: Optional[List[str]]):
         try:
             row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
         except Exception:
             row_idx = evt.index if hasattr(evt, "index") else None
         if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
             return ""
         sel = table.iloc[row_idx]
         subj = sel.get("subject", None)
-        frm  = sel.get("from_email", None)
         dstr = sel.get("date", None)
         cand = df
         if subj is not None:
             cand = cand[cand["subject"] == subj]
@@ -710,7 +793,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         if dstr is not None:
             cand = cand[cand["date"] == dstr]
         if len(cand) == 0:
-            cand = df[df["subject"] == sel.get("subject","")]
         if len(cand) == 0:
             return ""
         row = cand.iloc[0]

 # Optional fast ANN (CPU)
 try:
+    import faiss  # faiss-cpu on HF Space
     FAISS_OK = True
 except Exception:
     FAISS_OK = False
     VADER_OK = False
 # =================== Regex & Flags ===================
+# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
 TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
+# URLs -> "URL" (reduce feature bloat). Phone numbers are intentionally left unredacted.
 URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
+# Quote lines ("> ...")
 QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
+# Signature separator: lines after "-- " (standard)
 SIG_RE = re.compile(r"\n-- ?\n", re.M)
+# Device footers
 SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
 HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
+# Forward/quoted markers
 FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
+FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
+ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
+# Toggle for language detection (skip for speed)
 SKIP_LANGDETECT = True
+# Corruption keyword/phrase list (you can extend freely)
 SUSPECT_PHRASES = [
     "off the books", "cover up", "kickback", "bribe", "under the table",
     "no inspection", "special fee", "friendly payment", "confidential deal",
     return soup.get_text(separator="\n")
 def strip_quotes_and_sigs(text: str) -> str:
+    """Drop quoted lines, signatures, device footers, forwarded chains."""
     if not text:
         return ""
+    # remove > quoted lines
     text = QUOTE_LINE_RE.sub("", text)
+    # cut everything after signature separator
     parts = SIG_RE.split(text)
     if parts:
         text = parts[0]
+    # remove device footers
+    text = SENT_FROM_RE.sub("", text)
     text = HEBREW_SENT_FROM_RE.sub("", text)
+    # trim forwarded/quoted chains
     cut = None
     for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
         m = pat.search(text)
             cut = idx if (cut is None or idx < cut) else cut
     if cut is not None:
         text = text[:cut]
     return text.strip()
 def parse_name_email(s: str) -> Tuple[str, str]:
+    """Split 'Name <email>' into (name, email)."""
     if not s:
         return "", ""
     m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
     return "", s.strip()
 def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
+    """
+    Extract inline headers (From, To, CC, Date, Subject) from the text blob.
+    Returns (headers_dict, remaining_body_text).
+    """
     headers: Dict[str, str] = {}
     lines = (text or "").splitlines()
     header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
     i = 0
     saw_header = False
     while i < len(lines):
         line = lines[i].rstrip("\r")
         stripped = line.strip()
                 break
             else:
                 break
     body_text = "\n".join(lines[i:]) if i < len(lines) else ""
     return headers, body_text
 # =================== Normalization & Utilities ===================
 def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
+    """Normalize a single raw record into a structured row."""
     if str(raw.get("type", "")).lower() == "meta":
         return {}
     body_text_raw = raw.get("body_text") or raw.get("text") or ""
+    html_content = raw.get("body_html") or raw.get("html") or ""
     if html_content and not body_text_raw:
         body_text_raw = html_to_text(html_content)
     body_text_raw = ftfy.fix_text(body_text_raw or "")
     subject_text = ""
     from_name = from_email = from_domain = ""
     date_val = raw.get("date") or raw.get("Date") or ""
     if body_text_raw:
         headers, body_only = parse_email_headers(body_text_raw)
         subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
         sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
         date_val = headers.get("Date", "") or date_val
+        # Clean body; phone numbers are intentionally left unredacted
         body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
         body_clean = URL_RE.sub(" URL ", body_clean)
         body_clean = re.sub(r"\s+", " ", body_clean).strip()
         body_text = body_clean
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     else:
         sender = raw.get("from") or raw.get("From") or ""
         from_name, from_email = parse_name_email(sender)
         from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
     subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
     if use_langdetect:
         try:
             lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
             lang = "unknown"
     else:
         lang = "unknown"
     iso_date = ""
     if isinstance(date_val, (int, float)):
         try:
             iso_date = ""
     elif isinstance(date_val, str) and date_val:
         iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
     msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
     if not msg_id:
         msg_id = f"gen-{uuid.uuid4().hex}"
     thread_key = subject_norm or (from_email + body_text[:120])
     thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
     text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
     return {
         "message_id": str(msg_id),
         "thread_id": thread_id,
     }
 def has_suspect_tag(text: str) -> List[str]:
+    """Return list of corruption/suspicion tags present in text."""
     tags = []
     if not text:
         return tags
     analyzer = SentimentIntensityAnalyzer()
     scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
     df["sentiment_score"] = scores
+    # Sentiment bins (pd.cut is right-closed): negative [-1, -0.05], neutral (-0.05, 0.05], positive (0.05, 1]
     bins = [-1.01, -0.05, 0.05, 1.01]
     labels = ["negative", "neutral", "positive"]
     df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
     return df
+def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
+    """Email reader HTML with highlighted query terms and visible tags."""
     subject = (row.get("subject") or "").strip()
+    body = (row.get("body_text") or "").strip()
     from_email = row.get("from_email") or ""
+    date = row.get("date") or ""
+    tags = row.get("tags") or []
     sentiment = row.get("sentiment") or "(unknown)"
     def hi(text: str) -> str:
         return out
     subject_h = hi(subject)
+    body_h = hi(body)
+    # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
     rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
     dir_attr = ' dir="rtl"' if rtl else ""
     body_html = body_h.replace("\n", "<br/>")
     tag_html = ""
     out = {}
     uniq = np.unique(labels)
     for c in uniq:
+        mask = labels == c
         if mask.sum() == 0:
             out[int(c)] = f"cluster_{c}"
             continue
+        # mean TF-IDF per feature inside cluster
         mean_vec = X[mask].mean(axis=0).A1
         if mean_vec.size == 0:
             out[int(c)] = f"cluster_{c}"
     return out
 def auto_k_rule(n_docs: int) -> int:
+    # Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
     return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
 # =================== Gradio UI ===================
             )
         with gr.Row():
             date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
+            date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
     with gr.Row():
         run_btn = gr.Button("Process", variant="primary")
     with gr.Row():
         cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
+        domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
     gr.Markdown("### Search")
     with gr.Row():
     email_view = gr.HTML(label="Reader")
     # State
+    state_df = gr.State()          # full dataframe
+    state_vec = gr.State()         # TfidfVectorizer
+    state_X_reduced = gr.State()   # np.ndarray (LSA normalized) or None
+    state_index = gr.State()       # Faiss index or sklearn NN
+    state_term_names = gr.State()  # dict cluster_id -> label
+    state_query_terms = gr.State() # last search terms list
+    state_use_lsa = gr.State()
+    state_use_faiss = gr.State()
+    state_svd = gr.State()
+    state_norm = gr.State()
     # -------- IO helpers --------
     def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
                         recs = [obj]
         return recs
+    def _apply_filters(
+        df: pd.DataFrame,
+        cluster: Optional[str],
+        domain: Optional[str],
+        sentiment: str,
+        tag_value: str,
+        start: str,
+        end: str,
+    ) -> pd.DataFrame:
         out = df
         if cluster and cluster != "(any)":
+            # cluster values like "12 — payment, contract (534)"
             m = re.match(r"^(\d+)\s+—", cluster)
             if m:
                 cid = int(m.group(1))
         if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
             out = out[out["sentiment"].astype(str) == sentiment]
         if tag_value and tag_value != "(any)":
+            # tags is a list; check membership robustly
             out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
+        # date bounds
         if start:
             try:
                 dt = pd.to_datetime(start, utc=True, errors="coerce")
                      use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
         if inbox_file is None:
             return ("**Please upload a file.**",
+                    None, None, None, None, None, None, None, None, None, None, None, None, None, None)
         use_lang = not bool(skip_lang)
         recs = _load_json_records(inbox_file.name)
         if not recs:
             return ("**No valid records found.**",
+                    None, None, None, None, None, None, None, None, None, None, None, None, None, None)
+        # Normalize
         normd = []
         for r in tqdm(recs, desc="Normalize", leave=False):
             out = normalize_email_record(r, use_langdetect=use_lang)
         df = pd.DataFrame(normd)
         if df.empty:
             return ("**No usable email records after normalization.**",
+                    None, None, None, None, None, None, None, None, None, None, None, None, None, None)
+        # Deduplicate conservatively
         df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
+        # Tags (suspect/finance) + Sentiment
         df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
         df = compute_sentiment_column(df)
+        # Texts for modeling
         texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
+        # TF-IDF (sparse CSR float32)
         ngram_range = (1, 2) if use_bigrams else (1, 1)
         vec = TfidfVectorizer(
             analyzer="word",
             sublinear_tf=True,
             dtype=np.float32,
         )
+        X = vec.fit_transform(texts)  # CSR float32
+        # LSA (TruncatedSVD + Normalizer) for stability/quality
         use_lsa = bool(use_lsa)
         X_reduced = None
         svd_obj = None
         norm_obj = None
         if use_lsa:
             svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
+            X_reduced_tmp = svd_obj.fit_transform(X)  # dense (n_docs x lsa_dim)
             norm_obj = Normalizer(copy=False)
             X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
             del X_reduced_tmp
             gc.collect()
+        # KMeans clustering
         if bool(auto_k):
             k = auto_k_rule(X.shape[0])
         else:
         labels = kmeans.fit_predict(X_reduced if use_lsa else X)
         df["cluster_id"] = labels
+        # Name clusters by top terms (use original TF-IDF for interpretability)
         term_names = top_terms_per_cluster(X, labels, vec, topn=6)
         df["cluster_name"] = [term_names[int(c)] for c in labels]
+        # Build search index
         use_faiss = bool(use_faiss) and FAISS_OK
         index_obj = None
         if use_faiss and use_lsa:
+            # cosine ≈ inner product on normalized vectors
+            d = X_reduced.shape[1]
             index_obj = faiss.IndexFlatIP(d)
             index_obj.add(X_reduced)
         else:
+            # fallback to brute-force cosine on TF-IDF or reduced vectors
             nn = NearestNeighbors(metric="cosine", algorithm="brute")
             nn.fit(X_reduced if use_lsa else X)
             index_obj = nn
+        # Summaries
         cluster_counts = (
             df.groupby(["cluster_id", "cluster_name"]).size()
               .reset_index(name="count")
               .sort_values("count", ascending=False)
               .head(500)
         )
+        # For dropdown labels: "id — label (count)"
         cluster_counts["label"] = cluster_counts.apply(
             lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
         )
         )
         domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
+        # Results preview default (latest 500 by date if available)
         if "date" in df.columns and df["date"].notna().any():
             show_df = df.copy()
+            # coerce to datetime for sort
             show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
             show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
         else:
         gc.collect()
+        # Use gr.update to set dropdown choices + default values safely
         cluster_update = gr.update(choices=cluster_choices, value="(any)")
+        domain_update = gr.update(choices=domain_choices, value="(any)")
+        return (
+            status_md,
+            cluster_counts, domain_counts,
+            out_table,
+            df, vec, (X_reduced if use_lsa else None), index_obj, term_names,
+            use_lsa, (use_faiss and use_lsa and FAISS_OK),
+            cluster_update, domain_update,
+            svd_obj, norm_obj
+        )
     (run_btn.click)(
         process_file,
             return pd.DataFrame()
         filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
         cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
+        # default: sort by date desc if possible
         if "date" in filt.columns and filt["date"].notna().any():
             tmp = filt.copy()
             tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
             outputs=[results_df]
         )
+    # Safer reset: set dropdowns to None (always valid), others to defaults
     reset_btn.click(
+        lambda: [None, None, "(any)", "(any)", "", ""],
         inputs=[],
         outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
     ).then(
     def _tokenize_query(q: str) -> List[str]:
         if not q:
             return []
+        # split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
         parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
+        # dedupe while preserving order
         seen, out = set(), []
         for p in parts:
             if p.lower() not in seen:
                 out.append(p)
                 seen.add(p.lower())
+        return out[:8]  # limit highlights for performance
     def _project_query_to_lsa(q_vec, svd_obj, norm_obj) -> Optional[np.ndarray]:
         try:
+            q_red = svd_obj.transform(q_vec)   # (1, lsa_dim)
+            q_red = norm_obj.transform(q_red)  # normalize
             return q_red.astype(np.float32)
         except Exception:
             return None
         if (not q) or (df is None) or (vec is None) or (index_obj is None):
             return pd.DataFrame(), []
         q_terms = _tokenize_query(q)
+        # Vectorize the query
         q_vec = vec.transform([q])
         # Decide which space the index uses and project accordingly
         outputs=[results_df, state_query_terms]
     )
+    def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str], query_terms: Optional[List[str]]):
         try:
             row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
         except Exception:
             row_idx = evt.index if hasattr(evt, "index") else None
         if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
             return ""
+        # Get identifying columns from the table row to map back to original df row
         sel = table.iloc[row_idx]
         subj = sel.get("subject", None)
+        frm = sel.get("from_email", None)
         dstr = sel.get("date", None)
+        # match in original df
         cand = df
         if subj is not None:
             cand = cand[cand["subject"] == subj]
         if dstr is not None:
             cand = cand[cand["date"] == dstr]
         if len(cand) == 0:
+            cand = df[df["subject"] == sel.get("subject", "")]
         if len(cand) == 0:
             return ""
         row = cand.iloc[0]