Update app.py
Browse files
app.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
-
import os, re, json, io,
|
| 2 |
from datetime import datetime
|
| 3 |
-
from typing import List, Dict, Any, Tuple
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
|
|
|
| 7 |
import pyarrow as pa
|
| 8 |
import pyarrow.parquet as pq
|
| 9 |
|
|
@@ -15,39 +16,61 @@ DetectorFactory.seed = 0
|
|
| 15 |
import gradio as gr
|
| 16 |
from tqdm import tqdm
|
| 17 |
|
| 18 |
-
#
|
| 19 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 20 |
from sklearn.cluster import MiniBatchKMeans
|
| 21 |
from sklearn.neighbors import NearestNeighbors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# =================== Regex & Flags ===================
|
| 24 |
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 25 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 26 |
|
| 27 |
-
# URLs -> "URL" (
|
| 28 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 29 |
|
| 30 |
-
# Phone numbers -> "[PHONE]" (avoid clustering by unique numbers)
|
| 31 |
-
PHONE_RE = re.compile(r'\+?\d[\d\s\-\(\)\.]{7,}\d')
|
| 32 |
-
|
| 33 |
# Quote lines ("> ...")
|
| 34 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
| 35 |
|
| 36 |
# Signature separator: lines after "-- " (standard)
|
| 37 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
| 38 |
|
| 39 |
-
#
|
| 40 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 41 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
| 42 |
|
| 43 |
-
#
|
| 44 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 45 |
-
FWD_MSG_RE
|
| 46 |
-
ON_WROTE_RE
|
| 47 |
|
| 48 |
# Toggle for language detection (skip for speed)
|
| 49 |
SKIP_LANGDETECT = True
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# =================== HTML/Text Cleanup ===================
|
| 52 |
def html_to_text(html: str) -> str:
|
| 53 |
if not html:
|
|
@@ -58,22 +81,18 @@ def html_to_text(html: str) -> str:
|
|
| 58 |
return soup.get_text(separator="\n")
|
| 59 |
|
| 60 |
def strip_quotes_and_sigs(text: str) -> str:
|
| 61 |
-
"""Drop quoted lines, signatures, device footers,
|
| 62 |
if not text:
|
| 63 |
return ""
|
| 64 |
-
# Remove quoted lines that start with ">"
|
| 65 |
text = QUOTE_LINE_RE.sub("", text)
|
| 66 |
|
| 67 |
-
# Cut at signature separator if present
|
| 68 |
parts = SIG_RE.split(text)
|
| 69 |
if parts:
|
| 70 |
text = parts[0]
|
| 71 |
|
| 72 |
-
# Remove device footers (EN + Hebrew)
|
| 73 |
text = SENT_FROM_RE.sub("", text)
|
| 74 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
| 75 |
|
| 76 |
-
# Truncate at forwarded/quoted markers
|
| 77 |
cut = None
|
| 78 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
| 79 |
m = pat.search(text)
|
|
@@ -98,7 +117,6 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 98 |
"""
|
| 99 |
Extract inline headers (From, To, CC, Date, Subject) from the text blob.
|
| 100 |
Returns (headers_dict, remaining_body_text).
|
| 101 |
-
Handles cases without blank line between headers and body.
|
| 102 |
"""
|
| 103 |
headers: Dict[str, str] = {}
|
| 104 |
lines = (text or "").splitlines()
|
|
@@ -118,7 +136,6 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 118 |
key = key.strip()
|
| 119 |
value = rest.strip()
|
| 120 |
if value == "":
|
| 121 |
-
# Multi-line value continuation
|
| 122 |
j = i + 1
|
| 123 |
cont = []
|
| 124 |
while j < len(lines):
|
|
@@ -126,7 +143,6 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 126 |
nxts = nxt.strip()
|
| 127 |
if nxts == "" or header_pat.match(nxt):
|
| 128 |
break
|
| 129 |
-
# For Subject, avoid swallowing body if no blank line
|
| 130 |
if key.lower() == "subject":
|
| 131 |
if FWD_BEGIN_RE.match(nxts) or FWD_MSG_RE.match(nxts) or ON_WROTE_RE.match(nxts):
|
| 132 |
break
|
|
@@ -149,25 +165,19 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 149 |
body_text = "\n".join(lines[i:]) if i < len(lines) else ""
|
| 150 |
return headers, body_text
|
| 151 |
|
| 152 |
-
# =================== Normalization ===================
|
| 153 |
-
def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
| 154 |
-
"""
|
| 155 |
-
Tailored for records where headers (From/To/Date/Subject) live inside 'text',
|
| 156 |
-
possibly with 'html' holding the rendered version. Skips meta-only rows.
|
| 157 |
-
"""
|
| 158 |
-
# Skip metadata-only records early
|
| 159 |
if str(raw.get("type", "")).lower() == "meta":
|
| 160 |
return {}
|
| 161 |
|
| 162 |
-
# Get raw text or HTML
|
| 163 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 164 |
-
html_content
|
| 165 |
if html_content and not body_text_raw:
|
| 166 |
body_text_raw = html_to_text(html_content)
|
| 167 |
|
| 168 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 169 |
|
| 170 |
-
# Extract inline headers
|
| 171 |
subject_text = ""
|
| 172 |
from_name = from_email = from_domain = ""
|
| 173 |
date_val = raw.get("date") or raw.get("Date") or ""
|
|
@@ -178,17 +188,15 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 178 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 179 |
date_val = headers.get("Date", "") or date_val
|
| 180 |
|
| 181 |
-
# Clean body:
|
| 182 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 183 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
| 184 |
-
body_clean = PHONE_RE.sub(" [PHONE] ", body_clean)
|
| 185 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
| 186 |
body_text = body_clean
|
| 187 |
|
| 188 |
from_name, from_email = parse_name_email(sender)
|
| 189 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 190 |
else:
|
| 191 |
-
# Fallback if no text found (rare for your corpus)
|
| 192 |
subject_text = ftfy.fix_text(raw.get("subject") or raw.get("Subject") or "").strip()
|
| 193 |
body_text = ftfy.fix_text(raw.get("body_text") or raw.get("text") or "")
|
| 194 |
body_text = URL_RE.sub(" URL ", body_text)
|
|
@@ -198,11 +206,9 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 198 |
from_name, from_email = parse_name_email(sender)
|
| 199 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 200 |
|
| 201 |
-
# Subject normalization
|
| 202 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 203 |
|
| 204 |
-
|
| 205 |
-
if not SKIP_LANGDETECT:
|
| 206 |
try:
|
| 207 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
| 208 |
except Exception:
|
|
@@ -210,7 +216,6 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 210 |
else:
|
| 211 |
lang = "unknown"
|
| 212 |
|
| 213 |
-
# Date -> ISO8601 if possible
|
| 214 |
iso_date = ""
|
| 215 |
if isinstance(date_val, (int, float)):
|
| 216 |
try:
|
|
@@ -220,7 +225,6 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 220 |
elif isinstance(date_val, str) and date_val:
|
| 221 |
iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
|
| 222 |
|
| 223 |
-
# Stable IDs
|
| 224 |
msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
|
| 225 |
if not msg_id:
|
| 226 |
msg_id = f"gen-{uuid.uuid4().hex}"
|
|
@@ -242,45 +246,199 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 242 |
"text_hash": text_hash,
|
| 243 |
}
|
| 244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
# =================== Gradio UI ===================
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
gr.Markdown("""
|
| 248 |
-
# Email
|
| 249 |
-
**
|
| 250 |
-
Tailored for emails whose **From/To/Date/Subject** live inside the `text` body.
|
| 251 |
-
Upload **.jsonl** or **.json** (no truncation).
|
| 252 |
""")
|
| 253 |
|
| 254 |
with gr.Row():
|
| 255 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
| 256 |
|
| 257 |
-
with gr.
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
with gr.Row():
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
run_btn = gr.Button("Process", variant="primary")
|
| 270 |
-
status = gr.Textbox(label="Status", interactive=False)
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
|
|
|
| 274 |
|
|
|
|
| 275 |
with gr.Row():
|
| 276 |
-
search_query = gr.Textbox(label="Search
|
| 277 |
search_btn = gr.Button("Search")
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
# -------- IO helpers --------
|
| 286 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
@@ -295,7 +453,6 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 295 |
obj = json.loads(line)
|
| 296 |
except Exception:
|
| 297 |
continue
|
| 298 |
-
# Skip metadata-only entries
|
| 299 |
if str(obj.get("type", "")).lower() == "meta":
|
| 300 |
continue
|
| 301 |
recs.append(obj)
|
|
@@ -312,57 +469,74 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 312 |
recs = [obj]
|
| 313 |
return recs
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
out
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
return out
|
| 334 |
|
| 335 |
-
def auto_k_rule(n_docs: int) -> int:
|
| 336 |
-
# Scales sublinearly; keeps clusters between ~120 and 600
|
| 337 |
-
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 338 |
-
|
| 339 |
# -------- Main pipeline --------
|
| 340 |
-
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
|
|
|
| 341 |
if inbox_file is None:
|
| 342 |
-
return "Please upload a file",
|
|
|
|
| 343 |
|
| 344 |
-
|
| 345 |
-
global SKIP_LANGDETECT
|
| 346 |
-
SKIP_LANGDETECT = bool(skip_lang)
|
| 347 |
|
| 348 |
recs = _load_json_records(inbox_file.name)
|
| 349 |
if not recs:
|
| 350 |
-
return "No valid records found.",
|
|
|
|
| 351 |
|
| 352 |
-
# Normalize
|
| 353 |
normd = []
|
| 354 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 355 |
-
out = normalize_email_record(r)
|
| 356 |
if out and out.get("body_text") is not None:
|
| 357 |
normd.append(out)
|
| 358 |
df = pd.DataFrame(normd)
|
| 359 |
if df.empty:
|
| 360 |
-
return "No usable email records after normalization.",
|
|
|
|
| 361 |
|
| 362 |
# Deduplicate conservatively
|
| 363 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 364 |
|
| 365 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
|
| 367 |
|
| 368 |
# TF-IDF (sparse CSR float32)
|
|
@@ -380,27 +554,49 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 380 |
)
|
| 381 |
X = vec.fit_transform(texts) # CSR float32
|
| 382 |
|
| 383 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
if bool(auto_k):
|
| 385 |
k = auto_k_rule(X.shape[0])
|
| 386 |
else:
|
| 387 |
k = max(10, int(k_clusters or 350))
|
|
|
|
| 388 |
kmeans = MiniBatchKMeans(
|
| 389 |
n_clusters=k,
|
| 390 |
batch_size=int(mb_batch or 4096),
|
| 391 |
random_state=0,
|
| 392 |
-
n_init="auto"
|
| 393 |
)
|
| 394 |
-
labels = kmeans.fit_predict(X)
|
| 395 |
df["cluster_id"] = labels
|
| 396 |
|
| 397 |
-
# Name clusters by top terms
|
| 398 |
term_names = top_terms_per_cluster(X, labels, vec, topn=6)
|
| 399 |
df["cluster_name"] = [term_names[int(c)] for c in labels]
|
| 400 |
|
| 401 |
-
#
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
# Summaries
|
| 406 |
cluster_counts = (
|
|
@@ -409,48 +605,188 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 409 |
.sort_values("count", ascending=False)
|
| 410 |
.head(500)
|
| 411 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
domain_counts = (
|
| 414 |
df.groupby("from_domain").size()
|
| 415 |
.reset_index(name="count")
|
| 416 |
.sort_values("count", ascending=False)
|
| 417 |
-
.head(
|
| 418 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
preview_html = "<h3>Sample (first 20)</h3>" + df.head(20)[sample_cols].to_html(escape=False, index=False)
|
| 423 |
-
preview_html += "<br/><h3>Top sender domains</h3>" + domain_counts.to_html(escape=False, index=False)
|
| 424 |
|
| 425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
|
| 427 |
-
# Free
|
| 428 |
gc.collect()
|
| 429 |
|
| 430 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
-
|
|
|
|
| 433 |
process_file,
|
| 434 |
-
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
)
|
| 437 |
|
| 438 |
-
# --------
|
| 439 |
-
def
|
| 440 |
-
if
|
| 441 |
return pd.DataFrame()
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
search_btn.click(
|
| 451 |
search_fn,
|
| 452 |
-
inputs=[search_query, state_df,
|
| 453 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
)
|
| 455 |
|
| 456 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
import os, re, json, io, math, gc, uuid
|
| 2 |
from datetime import datetime
|
| 3 |
+
from typing import List, Dict, Any, Tuple, Optional
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
+
|
| 8 |
import pyarrow as pa
|
| 9 |
import pyarrow.parquet as pq
|
| 10 |
|
|
|
|
| 16 |
import gradio as gr
|
| 17 |
from tqdm import tqdm
|
| 18 |
|
| 19 |
+
# sklearn (CPU-friendly)
|
| 20 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 21 |
from sklearn.cluster import MiniBatchKMeans
|
| 22 |
from sklearn.neighbors import NearestNeighbors
|
| 23 |
+
from sklearn.decomposition import TruncatedSVD
|
| 24 |
+
from sklearn.preprocessing import Normalizer
|
| 25 |
+
|
| 26 |
+
# Optional fast ANN (CPU)
|
| 27 |
+
try:
|
| 28 |
+
import faiss # faiss-cpu on HF Space
|
| 29 |
+
FAISS_OK = True
|
| 30 |
+
except Exception:
|
| 31 |
+
FAISS_OK = False
|
| 32 |
+
|
| 33 |
+
# Optional, but strongly recommended (tiny + fast)
|
| 34 |
+
try:
|
| 35 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 36 |
+
VADER_OK = True
|
| 37 |
+
except Exception:
|
| 38 |
+
VADER_OK = False
|
| 39 |
|
| 40 |
# =================== Regex & Flags ===================
|
| 41 |
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 42 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 43 |
|
| 44 |
+
# URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
|
| 45 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
| 47 |
# Quote lines ("> ...")
|
| 48 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
| 49 |
|
| 50 |
# Signature separator: lines after "-- " (standard)
|
| 51 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
| 52 |
|
| 53 |
+
# Device footers
|
| 54 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 55 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
| 56 |
|
| 57 |
+
# Forward/quoted markers
|
| 58 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 59 |
+
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 60 |
+
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 61 |
|
| 62 |
# Toggle for language detection (skip for speed)
|
| 63 |
SKIP_LANGDETECT = True
|
| 64 |
|
| 65 |
+
# Corruption keyword/phrase list (you can extend freely)
|
| 66 |
+
SUSPECT_PHRASES = [
|
| 67 |
+
"off the books", "cover up", "kickback", "bribe", "under the table",
|
| 68 |
+
"no inspection", "special fee", "friendly payment", "confidential deal",
|
| 69 |
+
"nobody will find out", "pay to play", "cash only", "shell company",
|
| 70 |
+
"bid rigging", "embezzle", "slush fund", "false invoice", "ghost employee",
|
| 71 |
+
"contract splitting", "grease payment", "unreported", "unrecorded",
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
# =================== HTML/Text Cleanup ===================
|
| 75 |
def html_to_text(html: str) -> str:
|
| 76 |
if not html:
|
|
|
|
| 81 |
return soup.get_text(separator="\n")
|
| 82 |
|
| 83 |
def strip_quotes_and_sigs(text: str) -> str:
|
| 84 |
+
"""Drop quoted lines, signatures, device footers, forwarded chains."""
|
| 85 |
if not text:
|
| 86 |
return ""
|
|
|
|
| 87 |
text = QUOTE_LINE_RE.sub("", text)
|
| 88 |
|
|
|
|
| 89 |
parts = SIG_RE.split(text)
|
| 90 |
if parts:
|
| 91 |
text = parts[0]
|
| 92 |
|
|
|
|
| 93 |
text = SENT_FROM_RE.sub("", text)
|
| 94 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
| 95 |
|
|
|
|
| 96 |
cut = None
|
| 97 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
| 98 |
m = pat.search(text)
|
|
|
|
| 117 |
"""
|
| 118 |
Extract inline headers (From, To, CC, Date, Subject) from the text blob.
|
| 119 |
Returns (headers_dict, remaining_body_text).
|
|
|
|
| 120 |
"""
|
| 121 |
headers: Dict[str, str] = {}
|
| 122 |
lines = (text or "").splitlines()
|
|
|
|
| 136 |
key = key.strip()
|
| 137 |
value = rest.strip()
|
| 138 |
if value == "":
|
|
|
|
| 139 |
j = i + 1
|
| 140 |
cont = []
|
| 141 |
while j < len(lines):
|
|
|
|
| 143 |
nxts = nxt.strip()
|
| 144 |
if nxts == "" or header_pat.match(nxt):
|
| 145 |
break
|
|
|
|
| 146 |
if key.lower() == "subject":
|
| 147 |
if FWD_BEGIN_RE.match(nxts) or FWD_MSG_RE.match(nxts) or ON_WROTE_RE.match(nxts):
|
| 148 |
break
|
|
|
|
| 165 |
body_text = "\n".join(lines[i:]) if i < len(lines) else ""
|
| 166 |
return headers, body_text
|
| 167 |
|
| 168 |
+
# =================== Normalization & Utilities ===================
|
| 169 |
+
def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
|
| 170 |
+
"""Normalize a single raw record into a structured row."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
if str(raw.get("type", "")).lower() == "meta":
|
| 172 |
return {}
|
| 173 |
|
|
|
|
| 174 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 175 |
+
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 176 |
if html_content and not body_text_raw:
|
| 177 |
body_text_raw = html_to_text(html_content)
|
| 178 |
|
| 179 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 180 |
|
|
|
|
| 181 |
subject_text = ""
|
| 182 |
from_name = from_email = from_domain = ""
|
| 183 |
date_val = raw.get("date") or raw.get("Date") or ""
|
|
|
|
| 188 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 189 |
date_val = headers.get("Date", "") or date_val
|
| 190 |
|
| 191 |
+
# Clean body: NO phone redaction, per your request
|
| 192 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 193 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
|
|
|
| 194 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
| 195 |
body_text = body_clean
|
| 196 |
|
| 197 |
from_name, from_email = parse_name_email(sender)
|
| 198 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 199 |
else:
|
|
|
|
| 200 |
subject_text = ftfy.fix_text(raw.get("subject") or raw.get("Subject") or "").strip()
|
| 201 |
body_text = ftfy.fix_text(raw.get("body_text") or raw.get("text") or "")
|
| 202 |
body_text = URL_RE.sub(" URL ", body_text)
|
|
|
|
| 206 |
from_name, from_email = parse_name_email(sender)
|
| 207 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 208 |
|
|
|
|
| 209 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 210 |
|
| 211 |
+
if use_langdetect:
|
|
|
|
| 212 |
try:
|
| 213 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
| 214 |
except Exception:
|
|
|
|
| 216 |
else:
|
| 217 |
lang = "unknown"
|
| 218 |
|
|
|
|
| 219 |
iso_date = ""
|
| 220 |
if isinstance(date_val, (int, float)):
|
| 221 |
try:
|
|
|
|
| 225 |
elif isinstance(date_val, str) and date_val:
|
| 226 |
iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
|
| 227 |
|
|
|
|
| 228 |
msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
|
| 229 |
if not msg_id:
|
| 230 |
msg_id = f"gen-{uuid.uuid4().hex}"
|
|
|
|
| 246 |
"text_hash": text_hash,
|
| 247 |
}
|
| 248 |
|
| 249 |
+
def has_suspect_tag(text: str) -> List[str]:
|
| 250 |
+
"""Return list of corruption/suspicion tags present in text."""
|
| 251 |
+
tags = []
|
| 252 |
+
if not text:
|
| 253 |
+
return tags
|
| 254 |
+
low = text.lower()
|
| 255 |
+
hits = []
|
| 256 |
+
for phrase in SUSPECT_PHRASES:
|
| 257 |
+
if phrase in low:
|
| 258 |
+
hits.append("🚩suspect")
|
| 259 |
+
break
|
| 260 |
+
if "invoice" in low or "payment" in low or "contract" in low:
|
| 261 |
+
hits.append("finance")
|
| 262 |
+
if "wire" in low or "transfer" in low or "cash" in low:
|
| 263 |
+
if "finance" not in hits:
|
| 264 |
+
hits.append("finance")
|
| 265 |
+
return hits
|
| 266 |
+
|
| 267 |
+
def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
| 268 |
+
if not VADER_OK:
|
| 269 |
+
df["sentiment_score"] = np.nan
|
| 270 |
+
df["sentiment"] = "(unknown)"
|
| 271 |
+
return df
|
| 272 |
+
analyzer = SentimentIntensityAnalyzer()
|
| 273 |
+
scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
|
| 274 |
+
df["sentiment_score"] = scores
|
| 275 |
+
# VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
|
| 276 |
+
bins = [-1.01, -0.05, 0.05, 1.01]
|
| 277 |
+
labels = ["negative", "neutral", "positive"]
|
| 278 |
+
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 279 |
+
return df
|
| 280 |
+
|
| 281 |
+
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
|
| 282 |
+
"""Email reader HTML with highlighted query terms and visible tags."""
|
| 283 |
+
subject = (row.get("subject") or "").strip()
|
| 284 |
+
body = (row.get("body_text") or "").strip()
|
| 285 |
+
from_email = row.get("from_email") or ""
|
| 286 |
+
date = row.get("date") or ""
|
| 287 |
+
tags = row.get("tags") or []
|
| 288 |
+
sentiment = row.get("sentiment") or "(unknown)"
|
| 289 |
+
|
| 290 |
+
def hi(text: str) -> str:
|
| 291 |
+
if not text or not query_terms:
|
| 292 |
+
return text
|
| 293 |
+
out = text
|
| 294 |
+
for qt in query_terms:
|
| 295 |
+
if not qt:
|
| 296 |
+
continue
|
| 297 |
+
try:
|
| 298 |
+
pat = re.compile(re.escape(qt), re.I)
|
| 299 |
+
out = pat.sub(lambda m: f"<mark>{m.group(0)}</mark>", out)
|
| 300 |
+
except Exception:
|
| 301 |
+
pass
|
| 302 |
+
return out
|
| 303 |
+
|
| 304 |
+
subject_h = hi(subject)
|
| 305 |
+
body_h = hi(body)
|
| 306 |
+
|
| 307 |
+
# Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
|
| 308 |
+
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 309 |
+
dir_attr = ' dir="rtl"' if rtl else ""
|
| 310 |
+
|
| 311 |
+
tag_html = ""
|
| 312 |
+
if isinstance(tags, list) and tags:
|
| 313 |
+
tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
|
| 314 |
+
|
| 315 |
+
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 316 |
+
|
| 317 |
+
html = f"""
|
| 318 |
+
<div class="email-card">
|
| 319 |
+
<div class="email-header">
|
| 320 |
+
<div>
|
| 321 |
+
<div class="subject">{subject_h or "(no subject)"}</div>
|
| 322 |
+
<div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>
|
| 323 |
+
</div>
|
| 324 |
+
<div class="badges">
|
| 325 |
+
{cluster_html}
|
| 326 |
+
<span class="sentiment">sentiment: <b>{sentiment}</b></span>
|
| 327 |
+
{tag_html}
|
| 328 |
+
</div>
|
| 329 |
+
</div>
|
| 330 |
+
<div class="email-body" {dir_attr}>
|
| 331 |
+
{body_h.replace('\n','<br/>')}
|
| 332 |
+
</div>
|
| 333 |
+
</div>
|
| 334 |
+
"""
|
| 335 |
+
return html
|
| 336 |
+
|
| 337 |
+
def top_terms_per_cluster(X, labels, vectorizer, topn=6):
|
| 338 |
+
names = vectorizer.get_feature_names_out()
|
| 339 |
+
out = {}
|
| 340 |
+
uniq = np.unique(labels)
|
| 341 |
+
for c in uniq:
|
| 342 |
+
mask = (labels == c)
|
| 343 |
+
if mask.sum() == 0:
|
| 344 |
+
out[int(c)] = f"cluster_{c}"
|
| 345 |
+
continue
|
| 346 |
+
# mean TF-IDF per feature inside cluster
|
| 347 |
+
mean_vec = X[mask].mean(axis=0).A1
|
| 348 |
+
if mean_vec.size == 0:
|
| 349 |
+
out[int(c)] = f"cluster_{c}"
|
| 350 |
+
continue
|
| 351 |
+
idx = np.argpartition(mean_vec, -topn)[-topn:]
|
| 352 |
+
idx = idx[np.argsort(-mean_vec[idx])]
|
| 353 |
+
terms = [names[i] for i in idx if mean_vec[i] > 0]
|
| 354 |
+
out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
|
| 355 |
+
return out
|
| 356 |
+
|
| 357 |
+
def auto_k_rule(n_docs: int) -> int:
|
| 358 |
+
# Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
|
| 359 |
+
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 360 |
+
|
| 361 |
# =================== Gradio UI ===================
|
| 362 |
+
CSS = """
|
| 363 |
+
:root { --pill:#eef2ff; --pill-text:#3730a3; --tag:#eee; --tag-text:#444;}
|
| 364 |
+
.email-card { background:#fff; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.06); }
|
| 365 |
+
.email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
|
| 366 |
+
.subject { font-size:18px; font-weight:700; margin-bottom:6px; }
|
| 367 |
+
.meta { color:#666; font-size:12px; }
|
| 368 |
+
.badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
|
| 369 |
+
.cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
|
| 370 |
+
.sentiment { font-size:12px; color:#555; }
|
| 371 |
+
.tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
|
| 372 |
+
.email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.5; white-space:normal; }
|
| 373 |
+
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 374 |
+
.small { color:#666; font-size:12px; }
|
| 375 |
+
"""
|
| 376 |
+
|
| 377 |
+
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 378 |
gr.Markdown("""
|
| 379 |
+
# Email Investigator — TF-IDF + LSA + MiniBatchKMeans
|
| 380 |
+
**Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, and sentiment.
|
|
|
|
|
|
|
| 381 |
""")
|
| 382 |
|
| 383 |
with gr.Row():
|
| 384 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
| 385 |
|
| 386 |
+
with gr.Accordion("Vectorization & Clustering", open=True):
|
| 387 |
+
with gr.Row():
|
| 388 |
+
max_features = gr.Number(label="TF-IDF max_features", value=120_000, precision=0)
|
| 389 |
+
min_df = gr.Number(label="min_df (doc freq ≥)", value=2, precision=0)
|
| 390 |
+
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 391 |
+
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 392 |
+
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
| 393 |
+
with gr.Row():
|
| 394 |
+
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 395 |
+
lsa_dim = gr.Number(label="LSA components", value=150, precision=0)
|
| 396 |
+
auto_k = gr.Checkbox(label="Auto choose k", value=True)
|
| 397 |
+
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 398 |
+
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 399 |
+
with gr.Row():
|
| 400 |
+
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available)", value=True)
|
| 401 |
+
|
| 402 |
+
with gr.Accordion("Filters", open=True):
|
| 403 |
+
with gr.Row():
|
| 404 |
+
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 405 |
+
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
| 406 |
+
sentiment_drop = gr.Dropdown(
|
| 407 |
+
label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)"
|
| 408 |
+
)
|
| 409 |
+
with gr.Row():
|
| 410 |
+
tag_drop = gr.Dropdown(
|
| 411 |
+
label="Tag", choices=["(any)", "🚩suspect", "finance"], value="(any)"
|
| 412 |
+
)
|
| 413 |
+
with gr.Row():
|
| 414 |
+
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 415 |
+
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 416 |
|
| 417 |
with gr.Row():
|
| 418 |
+
run_btn = gr.Button("Process", variant="primary")
|
| 419 |
+
reset_btn = gr.Button("Reset filters")
|
| 420 |
+
status = gr.Markdown("")
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
+
with gr.Row():
|
| 423 |
+
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
|
| 424 |
+
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 425 |
|
| 426 |
+
gr.Markdown("### Search")
|
| 427 |
with gr.Row():
|
| 428 |
+
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
| 429 |
search_btn = gr.Button("Search")
|
| 430 |
+
results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True, height=360)
|
| 431 |
+
email_view = gr.HTML(label="Reader")
|
| 432 |
+
|
| 433 |
+
# State
|
| 434 |
+
state_df = gr.State() # full dataframe
|
| 435 |
+
state_vec = gr.State() # TfidfVectorizer
|
| 436 |
+
state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
|
| 437 |
+
state_index = gr.State() # Faiss index or sklearn NN
|
| 438 |
+
state_term_names = gr.State() # dict cluster_id -> label
|
| 439 |
+
state_query_terms = gr.State() # last search terms list
|
| 440 |
+
state_use_lsa = gr.State()
|
| 441 |
+
state_use_faiss = gr.State()
|
| 442 |
|
| 443 |
# -------- IO helpers --------
|
| 444 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
|
|
| 453 |
obj = json.loads(line)
|
| 454 |
except Exception:
|
| 455 |
continue
|
|
|
|
| 456 |
if str(obj.get("type", "")).lower() == "meta":
|
| 457 |
continue
|
| 458 |
recs.append(obj)
|
|
|
|
| 469 |
recs = [obj]
|
| 470 |
return recs
|
| 471 |
|
| 472 |
+
def _apply_filters(df: pd.DataFrame,
|
| 473 |
+
cluster: Optional[str],
|
| 474 |
+
domain: Optional[str],
|
| 475 |
+
sentiment: str,
|
| 476 |
+
tag_value: str,
|
| 477 |
+
start: str, end: str) -> pd.DataFrame:
|
| 478 |
+
out = df
|
| 479 |
+
if cluster and cluster != "(any)":
|
| 480 |
+
# cluster values like "12 — payment, contract (534)"
|
| 481 |
+
m = re.match(r"^(\d+)\s+—", cluster)
|
| 482 |
+
if m:
|
| 483 |
+
cid = int(m.group(1))
|
| 484 |
+
out = out[out["cluster_id"] == cid]
|
| 485 |
+
if domain and domain != "(any)":
|
| 486 |
+
out = out[out["from_domain"] == domain]
|
| 487 |
+
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 488 |
+
out = out[out["sentiment"].astype(str) == sentiment]
|
| 489 |
+
if tag_value and tag_value != "(any)":
|
| 490 |
+
# tags is a list; check membership robustly
|
| 491 |
+
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
|
| 492 |
+
# date bounds
|
| 493 |
+
if start:
|
| 494 |
+
try:
|
| 495 |
+
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
| 496 |
+
out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") >= dt]
|
| 497 |
+
except Exception:
|
| 498 |
+
pass
|
| 499 |
+
if end:
|
| 500 |
+
try:
|
| 501 |
+
dt = pd.to_datetime(end, utc=True, errors="coerce")
|
| 502 |
+
out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
|
| 503 |
+
except Exception:
|
| 504 |
+
pass
|
| 505 |
return out
|
| 506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
# -------- Main pipeline --------
|
| 508 |
+
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 509 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
|
| 510 |
if inbox_file is None:
|
| 511 |
+
return ("**Please upload a file.**",
|
| 512 |
+
None, None, None, None, None, None, None, None, None, None)
|
| 513 |
|
| 514 |
+
use_lang = not bool(skip_lang)
|
|
|
|
|
|
|
| 515 |
|
| 516 |
recs = _load_json_records(inbox_file.name)
|
| 517 |
if not recs:
|
| 518 |
+
return ("**No valid records found.**",
|
| 519 |
+
None, None, None, None, None, None, None, None, None, None)
|
| 520 |
|
| 521 |
+
# Normalize
|
| 522 |
normd = []
|
| 523 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 524 |
+
out = normalize_email_record(r, use_langdetect=use_lang)
|
| 525 |
if out and out.get("body_text") is not None:
|
| 526 |
normd.append(out)
|
| 527 |
df = pd.DataFrame(normd)
|
| 528 |
if df.empty:
|
| 529 |
+
return ("**No usable email records after normalization.**",
|
| 530 |
+
None, None, None, None, None, None, None, None, None, None)
|
| 531 |
|
| 532 |
# Deduplicate conservatively
|
| 533 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 534 |
|
| 535 |
+
# Tags (suspect/finance) + Sentiment
|
| 536 |
+
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 537 |
+
df = compute_sentiment_column(df)
|
| 538 |
+
|
| 539 |
+
# Texts for modeling
|
| 540 |
texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
|
| 541 |
|
| 542 |
# TF-IDF (sparse CSR float32)
|
|
|
|
| 554 |
)
|
| 555 |
X = vec.fit_transform(texts) # CSR float32
|
| 556 |
|
| 557 |
+
# LSA (TruncatedSVD + Normalizer) for stability/quality
|
| 558 |
+
use_lsa = bool(use_lsa)
|
| 559 |
+
X_reduced = None
|
| 560 |
+
if use_lsa:
|
| 561 |
+
svd = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 562 |
+
X_reduced_tmp = svd.fit_transform(X) # dense (n_docs x lsa_dim)
|
| 563 |
+
normalizer = Normalizer(copy=False)
|
| 564 |
+
X_reduced = normalizer.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 565 |
+
del X_reduced_tmp
|
| 566 |
+
gc.collect()
|
| 567 |
+
|
| 568 |
+
# KMeans clustering
|
| 569 |
if bool(auto_k):
|
| 570 |
k = auto_k_rule(X.shape[0])
|
| 571 |
else:
|
| 572 |
k = max(10, int(k_clusters or 350))
|
| 573 |
+
|
| 574 |
kmeans = MiniBatchKMeans(
|
| 575 |
n_clusters=k,
|
| 576 |
batch_size=int(mb_batch or 4096),
|
| 577 |
random_state=0,
|
| 578 |
+
n_init="auto",
|
| 579 |
)
|
| 580 |
+
labels = kmeans.fit_predict(X_reduced if use_lsa else X)
|
| 581 |
df["cluster_id"] = labels
|
| 582 |
|
| 583 |
+
# Name clusters by top terms (use original TF-IDF for interpretability)
|
| 584 |
term_names = top_terms_per_cluster(X, labels, vec, topn=6)
|
| 585 |
df["cluster_name"] = [term_names[int(c)] for c in labels]
|
| 586 |
|
| 587 |
+
# Build search index
|
| 588 |
+
use_faiss = bool(use_faiss) and FAISS_OK
|
| 589 |
+
index_obj = None
|
| 590 |
+
if use_faiss and use_lsa:
|
| 591 |
+
# cosine ≈ inner product on normalized vectors
|
| 592 |
+
d = (X_reduced.shape[1])
|
| 593 |
+
index_obj = faiss.IndexFlatIP(d)
|
| 594 |
+
index_obj.add(X_reduced)
|
| 595 |
+
else:
|
| 596 |
+
# fallback to brute-force cosine on TF-IDF or reduced vectors
|
| 597 |
+
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 598 |
+
nn.fit(X_reduced if use_lsa else X)
|
| 599 |
+
index_obj = nn
|
| 600 |
|
| 601 |
# Summaries
|
| 602 |
cluster_counts = (
|
|
|
|
| 605 |
.sort_values("count", ascending=False)
|
| 606 |
.head(500)
|
| 607 |
)
|
| 608 |
+
# For dropdown labels: "id — label (count)"
|
| 609 |
+
cluster_counts["label"] = cluster_counts.apply(
|
| 610 |
+
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 611 |
+
)
|
| 612 |
+
cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
|
| 613 |
|
| 614 |
domain_counts = (
|
| 615 |
df.groupby("from_domain").size()
|
| 616 |
.reset_index(name="count")
|
| 617 |
.sort_values("count", ascending=False)
|
| 618 |
+
.head(100)
|
| 619 |
)
|
| 620 |
+
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 621 |
+
|
| 622 |
+
# Results preview default (latest 500 by date if available)
|
| 623 |
+
if "date" in df.columns and df["date"].notna().any():
|
| 624 |
+
show_df = df.copy()
|
| 625 |
+
# coerce to datetime for sort
|
| 626 |
+
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 627 |
+
show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
|
| 628 |
+
else:
|
| 629 |
+
show_df = df.copy()
|
| 630 |
|
| 631 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
|
| 632 |
+
out_table = show_df[cols_out].head(500)
|
|
|
|
|
|
|
| 633 |
|
| 634 |
+
status_md = (
|
| 635 |
+
f"**Processed {len(df):,} emails** \n"
|
| 636 |
+
f"TF-IDF shape = {X.shape[0]:,} × {X.shape[1]:,} | "
|
| 637 |
+
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 638 |
+
f"k = {k} | Search = {'Faiss (IP on LSA)' if (use_faiss and use_lsa and FAISS_OK) else 'cosine brute-force'}"
|
| 639 |
+
)
|
| 640 |
|
| 641 |
+
# Free some heavy temporaries from local scope
|
| 642 |
gc.collect()
|
| 643 |
|
| 644 |
+
return (status_md,
|
| 645 |
+
cluster_counts, domain_counts,
|
| 646 |
+
out_table,
|
| 647 |
+
df, vec, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 648 |
+
use_lsa, (use_faiss and use_lsa and FAISS_OK),
|
| 649 |
+
cluster_choices, domain_choices)
|
| 650 |
|
| 651 |
+
# Wire process
|
| 652 |
+
(run_btn.click)(
|
| 653 |
process_file,
|
| 654 |
+
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 655 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss],
|
| 656 |
+
outputs=[status,
|
| 657 |
+
cluster_counts_df, domain_counts_df,
|
| 658 |
+
results_df,
|
| 659 |
+
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 660 |
+
state_use_lsa, state_use_faiss,
|
| 661 |
+
cluster_drop, domain_drop]
|
| 662 |
)
|
| 663 |
|
| 664 |
+
# -------- Filtering & Search --------
|
| 665 |
+
def refresh_results(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end):
    """Re-apply the sidebar filters and return up to 500 rows for display.

    Rows are ordered newest-first when a usable "date" column exists;
    otherwise the filtered frame is shown in its current order.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    display_cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
    filtered = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)

    sortable = "date" in filtered.columns and filtered["date"].notna().any()
    if not sortable:
        return filtered[display_cols].head(500)

    # Sort on a throwaway parsed-datetime column, newest first.
    ordered = filtered.copy()
    ordered["_dt"] = pd.to_datetime(ordered["date"], utc=True, errors="coerce")
    ordered = ordered.sort_values("_dt", ascending=False).drop(columns=["_dt"])
    return ordered[display_cols].head(500)
|
| 677 |
+
|
| 678 |
+
for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
|
| 679 |
+
ctrl.change(
|
| 680 |
+
refresh_results,
|
| 681 |
+
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 682 |
+
outputs=[results_df]
|
| 683 |
+
)
|
| 684 |
+
|
| 685 |
+
reset_btn.click(
|
| 686 |
+
lambda: ["(any)", "(any)", "(any)", "(any)", "", ""],
|
| 687 |
+
inputs=[],
|
| 688 |
+
outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
|
| 689 |
+
).then(
|
| 690 |
+
refresh_results,
|
| 691 |
+
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 692 |
+
outputs=[results_df]
|
| 693 |
+
)
|
| 694 |
+
|
| 695 |
+
def _tokenize_query(q: str) -> List[str]:
|
| 696 |
+
if not q:
|
| 697 |
+
return []
|
| 698 |
+
# split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
|
| 699 |
+
parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
|
| 700 |
+
# dedupe while preserving order
|
| 701 |
+
seen, out = set(), []
|
| 702 |
+
for p in parts:
|
| 703 |
+
if p.lower() not in seen:
|
| 704 |
+
out.append(p)
|
| 705 |
+
seen.add(p.lower())
|
| 706 |
+
return out[:8] # limit highlights for performance
|
| 707 |
+
|
| 708 |
+
def search_fn(q, df, vec, X_reduced, index_obj, use_lsa_flag, use_faiss_flag):
    """Nearest-neighbour search for query *q* over the indexed corpus.

    Returns (results_dataframe, query_terms). Only the sklearn
    NearestNeighbors path can actually serve queries: a Faiss index lives
    in LSA space, and the fitted SVD/Normalizer were deliberately not
    persisted (to save RAM), so a TF-IDF query vector cannot be projected
    into it — that case returns an empty frame rather than wrong matches.
    The *use_lsa_flag*/*use_faiss_flag* parameters are retained for
    interface compatibility with the stored state but are no longer
    consulted: the index object's type is authoritative.

    Args:
        q: raw query string from the search box.
        df: the full processed dataframe.
        vec: fitted TfidfVectorizer used to embed the query.
        X_reduced: unused here (kept for signature compatibility).
        index_obj: sklearn NearestNeighbors or a Faiss index.

    Returns:
        (top-50 results with a "score" column, tokenized query terms).
    """
    if (not q) or (df is None) or (vec is None) or (index_obj is None):
        return pd.DataFrame(), []

    q_terms = _tokenize_query(q)
    q_vec = vec.transform([q])  # TF-IDF space

    if not isinstance(index_obj, NearestNeighbors):
        # Faiss (or unknown) index: no way to project q into LSA space
        # here, so refuse rather than return mismatched neighbours.
        return pd.DataFrame(), q_terms

    try:
        distances, indices = index_obj.kneighbors(q_vec, n_neighbors=min(50, len(df)))
    except ValueError:
        # The index was fit on LSA-reduced vectors, whose dimensionality
        # differs from the TF-IDF query — searching is impossible without
        # the (unpersisted) projection, so degrade gracefully.
        return pd.DataFrame(), q_terms

    results = df.iloc[indices[0]].copy()
    results["score"] = 1.0 - distances[0]  # cosine distance -> similarity
    cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "score"]
    return results[cols].head(50), q_terms
|
| 750 |
|
| 751 |
search_btn.click(
|
| 752 |
search_fn,
|
| 753 |
+
inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss],
|
| 754 |
+
outputs=[results_df, state_query_terms]
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str], query_terms: Optional[List[str]]):
    """Render the clicked results row as highlighted HTML for the reader pane.

    The displayed table only carries a projection of the columns, so the
    selected row is mapped back to the full dataframe by matching
    subject/sender/date; if that narrows to nothing, a subject-only
    retry is attempted before giving up with an empty string.
    """
    # Gradio may hand back the selection index as a scalar or a pair.
    try:
        idx = evt.index
        row_idx = idx[0] if isinstance(idx, (list, tuple)) else idx
    except Exception:
        row_idx = getattr(evt, "index", None)

    if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
        return ""

    sel = table.iloc[row_idx]

    # Narrow the full dataframe by each identifying field that is present.
    cand = df
    for col in ("subject", "from_email", "date"):
        value = sel.get(col, None)
        if value is not None:
            cand = cand[cand[col] == value]

    if len(cand) == 0:
        # Looser retry: subject alone (empty-string fallback if missing).
        cand = df[df["subject"] == sel.get("subject", "")]
    if len(cand) == 0:
        return ""

    row = cand.iloc[0]
    cid = int(row.get("cluster_id", -1))
    cluster_label = term_names.get(cid, f"cluster_{cid}") if term_names else None
    return build_highlighted_html(row, query_terms=query_terms, cluster_label=cluster_label)
|
| 785 |
+
|
| 786 |
+
results_df.select(
|
| 787 |
+
on_row_select,
|
| 788 |
+
inputs=[results_df, state_df, state_term_names, state_query_terms],
|
| 789 |
+
outputs=[email_view]
|
| 790 |
)
|
| 791 |
|
| 792 |
if __name__ == "__main__":
|