Update app.py
Browse files
app.py
CHANGED
|
@@ -72,7 +72,7 @@ TAXONOMY = {
|
|
| 72 |
"HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
|
| 73 |
"Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
|
| 74 |
"Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
|
| 75 |
-
"Legal": ["legal","lawsuit","attorney","counsel","privileged","court","subpoena","confidential"],
|
| 76 |
"IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
|
| 77 |
"Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
|
| 78 |
"Other": [],
|
|
@@ -94,38 +94,31 @@ def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
|
|
| 94 |
if bucket == "IT/Security":
|
| 95 |
return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
|
| 96 |
if bucket == "Constituent":
|
| 97 |
-
# personal mail to public office is a strong hint
|
| 98 |
return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
|
| 99 |
if bucket == "Lobbyist":
|
| 100 |
return 5.0 if fd in LOBBY_DOMAINS else 0.0
|
| 101 |
if bucket == "Legal":
|
| 102 |
return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
|
| 103 |
if bucket == "Scheduling":
|
| 104 |
-
# ICS invite or explicit invite subject
|
| 105 |
body = (row.get("body_text") or "")
|
| 106 |
return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
|
| 107 |
return 0.0
|
| 108 |
|
| 109 |
-
MIN_ROUTE_SCORE = 1.5
|
| 110 |
TIE_MARGIN = 1.0
|
| 111 |
|
| 112 |
def route_email_row(row: pd.Series) -> str:
|
| 113 |
text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 114 |
scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
|
| 115 |
-
# lexicon points
|
| 116 |
for b, terms in TAXONOMY.items():
|
| 117 |
-
if not terms:
|
| 118 |
continue
|
| 119 |
-
# count unique term hits to avoid over-crediting repeats
|
| 120 |
hits = sum(1 for t in terms if t and t.lower() in text)
|
| 121 |
scores[b] += float(hits)
|
| 122 |
-
# strong phrases in your corruption lexicon can hint Lobbyist/Procurement
|
| 123 |
if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
|
| 124 |
scores[b] += 1.0
|
| 125 |
-
# header bonuses
|
| 126 |
for b in TAXONOMY.keys():
|
| 127 |
scores[b] += _bucket_header_bonus(row, b)
|
| 128 |
-
# choose
|
| 129 |
best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
|
| 130 |
second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
|
| 131 |
if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
|
|
@@ -147,35 +140,28 @@ SKIP_LANGDETECT = True
|
|
| 147 |
|
| 148 |
# ==== Expanded corruption lexicon ====
|
| 149 |
SUSPECT_PHRASES = [
|
| 150 |
-
# core corruption/finance
|
| 151 |
"off the books","cover up","kickback","bribe","under the table",
|
| 152 |
"no inspection","special fee","friendly payment","confidential deal",
|
| 153 |
"nobody will find out","pay to play","cash only","shell company",
|
| 154 |
"bid rigging","embezzle","slush fund","false invoice","ghost employee",
|
| 155 |
"contract splitting","grease payment","unreported","unrecorded",
|
| 156 |
-
# secrecy/evasion
|
| 157 |
"off the record","just between us","don’t quote me on this","dont quote me on this",
|
| 158 |
"we never had this conversation","keep this between us","not ethical","illegal",
|
| 159 |
"grey area","gray area","write off","failed investment","they owe it to me",
|
| 160 |
-
# off-channel comms
|
| 161 |
"let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
|
| 162 |
"don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
|
| 163 |
"tell you on the phone","talk in person","come by my office","vpn",
|
| 164 |
-
# financial secrecy & accounting games
|
| 165 |
"tax haven","off-shore account","offshore account","backdate","pull earnings forward",
|
| 166 |
"delete this email","no inspection","special fees","wire instructions",
|
| 167 |
]
|
| 168 |
-
# Evasive acronyms / slang (case-insensitive)
|
| 169 |
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
|
| 170 |
|
| 171 |
-
# Entity regexes
|
| 172 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 173 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 174 |
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
|
| 175 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 176 |
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
|
| 177 |
|
| 178 |
-
# Off-channel patterns (apps / phrases)
|
| 179 |
OFFCHANNEL_PATTERNS = [
|
| 180 |
r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
|
| 181 |
r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
|
|
@@ -184,10 +170,8 @@ OFFCHANNEL_PATTERNS = [
|
|
| 184 |
]
|
| 185 |
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
| 186 |
|
| 187 |
-
# Common personal mail domains (used with user-specified trusted org domains)
|
| 188 |
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 189 |
|
| 190 |
-
# Newsletter/newswire heuristics
|
| 191 |
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
|
| 192 |
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
| 193 |
s = (subject or "").lower()
|
|
@@ -198,13 +182,11 @@ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
|
| 198 |
if any(d in fd for d in NEWS_DOMAINS): return True
|
| 199 |
return False
|
| 200 |
|
| 201 |
-
# -------- System/notification heuristics (bucket as cluster -2) --------
|
| 202 |
NOTIFY_PATTERNS = [
|
| 203 |
r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
|
| 204 |
r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
|
| 205 |
r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
|
| 206 |
r"unable to determine", r"reset your password", r"\balert\b",
|
| 207 |
-
# bounces / gateways / quarantine
|
| 208 |
r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
|
| 209 |
r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
|
| 210 |
r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
|
|
@@ -221,7 +203,6 @@ def is_notification_like(subject: str, body: str, from_email: str, from_domain:
|
|
| 221 |
return True
|
| 222 |
return False
|
| 223 |
|
| 224 |
-
# -------- Fast language heuristic (used when skipping langdetect or on failure) --------
|
| 225 |
HEB_RE = re.compile(r'[\u0590-\u05FF]')
|
| 226 |
AR_RE = re.compile(r'[\u0600-\u06FF]')
|
| 227 |
CYR_RE = re.compile(r'[\u0400-\u04FF]')
|
|
@@ -236,7 +217,6 @@ def fast_lang_heuristic(text: str) -> str:
|
|
| 236 |
return "en"
|
| 237 |
return "unknown"
|
| 238 |
|
| 239 |
-
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 240 |
CORR_LEX = {
|
| 241 |
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
| 242 |
"invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
|
|
@@ -258,16 +238,12 @@ MONTHS = {
|
|
| 258 |
"january","february","march","april","june","july","august","september",
|
| 259 |
"october","november","december"
|
| 260 |
}
|
| 261 |
-
|
| 262 |
-
# Extra junk/HTML/MIME terms to suppress in labels (expanded)
|
| 263 |
STOP_TERMS = {
|
| 264 |
"div","span","nbsp","href","src","img","class","style","align","border","cid",
|
| 265 |
"content","content-type","multipart","alternative","quoted","printable","utf",
|
| 266 |
"windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
|
| 267 |
"type","id","service","person","generated","fyi"
|
| 268 |
}
|
| 269 |
-
|
| 270 |
-
# NEW: broader stop buckets for labels *and* features
|
| 271 |
AUX_STOP = {
|
| 272 |
"will","would","should","could","can","cant","cannot","did","do","does","done",
|
| 273 |
"have","has","had","having","get","got","make","made","let","need","want",
|
|
@@ -288,20 +264,17 @@ TECH_META = {
|
|
| 288 |
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
|
| 289 |
HE_EXTRA_STOP = {"עם","או"}
|
| 290 |
|
| 291 |
-
# fold into STOP_TERMS and build a vectorizer stoplist
|
| 292 |
STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
|
| 293 |
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
|
| 294 |
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
|
| 295 |
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
|
| 296 |
ONE_CHAR_RE = re.compile(r"^.$")
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$") # too many digits
|
| 302 |
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")
|
| 303 |
|
| 304 |
-
# This stoplist is used by the CountVectorizer (MUST be list for sklearn)
|
| 305 |
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 306 |
|
| 307 |
def _is_junk_term(t: str) -> bool:
|
|
@@ -366,7 +339,6 @@ def strip_quotes_and_sigs(text: str) -> str:
|
|
| 366 |
cut = idx if (cut is None or idx < cut) else cut
|
| 367 |
if cut is not None:
|
| 368 |
text = text[:cut]
|
| 369 |
-
# extra safety for mobile signatures that sneak through
|
| 370 |
text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
|
| 371 |
text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
|
| 372 |
return text.strip()
|
|
@@ -439,7 +411,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 439 |
if str(raw.get("type", "")).lower() == "meta":
|
| 440 |
return {}
|
| 441 |
|
| 442 |
-
# attachments (names); accept common schemas
|
| 443 |
attach_names = []
|
| 444 |
atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
|
| 445 |
if isinstance(atts, list):
|
|
@@ -490,7 +461,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 490 |
|
| 491 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 492 |
|
| 493 |
-
# Language (use fast heuristic if skipping or detector fails)
|
| 494 |
if use_langdetect:
|
| 495 |
try:
|
| 496 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
|
@@ -562,30 +532,23 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 562 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 563 |
return df
|
| 564 |
|
| 565 |
-
# Visual highlight helpers
|
| 566 |
def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
|
| 567 |
terms = []
|
| 568 |
txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
|
| 569 |
-
# suspect phrases found in this row
|
| 570 |
for p in SUSPECT_PHRASES:
|
| 571 |
if p in txt:
|
| 572 |
terms.append(p)
|
| 573 |
-
# entity markers
|
| 574 |
if MONEY_RE.search(txt): terms.append("$")
|
| 575 |
if INVOICE_RE.search(txt): terms.append("invoice")
|
| 576 |
-
# regex-based (keep as literal samples)
|
| 577 |
if PHONE_RE.search(row.get("body_text","") or ""): terms.append("phone")
|
| 578 |
-
# extras from user input
|
| 579 |
for t in extra_terms or []:
|
| 580 |
t=t.strip()
|
| 581 |
if t and t.lower() in txt:
|
| 582 |
terms.append(t)
|
| 583 |
-
# dedupe
|
| 584 |
out, seen = [], set()
|
| 585 |
for t in terms:
|
| 586 |
if t.lower() not in seen:
|
| 587 |
-
out.append(t)
|
| 588 |
-
seen.add(t.lower())
|
| 589 |
return out[:24]
|
| 590 |
|
| 591 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
|
|
@@ -603,7 +566,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 603 |
hl_terms = []
|
| 604 |
if do_highlight:
|
| 605 |
hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
|
| 606 |
-
# make unique, case-insensitive
|
| 607 |
seen=set(); uniq=[]
|
| 608 |
for t in hl_terms:
|
| 609 |
tl=t.lower()
|
|
@@ -668,11 +630,6 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 668 |
|
| 669 |
# ---------- Lightweight Embedding Utilities (Optional) ----------
|
| 670 |
def _load_embeddings(emb_path: str, binary: bool):
|
| 671 |
-
"""
|
| 672 |
-
Load word vectors with Gensim if available.
|
| 673 |
-
Accepts word2vec binary (.bin) or text formats (.txt/.vec).
|
| 674 |
-
Returns (model, dim) or (None, 0) if not available.
|
| 675 |
-
"""
|
| 676 |
if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
|
| 677 |
return None, 0
|
| 678 |
try:
|
|
@@ -682,7 +639,6 @@ def _load_embeddings(emb_path: str, binary: bool):
|
|
| 682 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
|
| 683 |
return kv, int(kv.vector_size)
|
| 684 |
except Exception:
|
| 685 |
-
# Attempt GloVe-like with headerless text
|
| 686 |
try:
|
| 687 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
|
| 688 |
return kv, int(kv.vector_size)
|
|
@@ -690,10 +646,6 @@ def _load_embeddings(emb_path: str, binary: bool):
|
|
| 690 |
return None, 0
|
| 691 |
|
| 692 |
def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
| 693 |
-
"""
|
| 694 |
-
Average embeddings over tokens matched by TOKEN_PATTERN.
|
| 695 |
-
Returns zero vector if nothing matches or kv is None.
|
| 696 |
-
"""
|
| 697 |
vec = np.zeros((dim,), dtype=np.float32)
|
| 698 |
if not kv or not text:
|
| 699 |
return vec
|
|
@@ -705,16 +657,12 @@ def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
|
| 705 |
cnt += 1
|
| 706 |
if cnt > 0:
|
| 707 |
vec /= float(cnt)
|
| 708 |
-
# L2-normalize
|
| 709 |
n = np.linalg.norm(vec)
|
| 710 |
if n > 0:
|
| 711 |
vec /= n
|
| 712 |
return vec
|
| 713 |
|
| 714 |
def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
|
| 715 |
-
"""
|
| 716 |
-
Build [n_docs, dim] dense matrix of averaged embeddings.
|
| 717 |
-
"""
|
| 718 |
if not kv or dim <= 0:
|
| 719 |
return np.zeros((len(texts), 0), dtype=np.float32)
|
| 720 |
out = np.zeros((len(texts), dim), dtype=np.float32)
|
|
@@ -773,15 +721,8 @@ def cluster_labels_pmi_bigram(
|
|
| 773 |
topn=6,
|
| 774 |
subject_alpha=0.75,
|
| 775 |
global_ubiq_cut=0.20,
|
| 776 |
-
subject_min_cov=0.30
|
| 777 |
):
|
| 778 |
-
"""
|
| 779 |
-
Improved labeler:
|
| 780 |
-
- Considers bigrams AND trigrams (PMI vs. global)
|
| 781 |
-
- Class-TFIDF unigrams with subject coverage boost
|
| 782 |
-
- Suppresses globally ubiquitous tokens/phrases (appear in >20% docs by default)
|
| 783 |
-
- NEW: prefers subject terms that occur in ≥30% of cluster subjects (can be tuned via subject_min_cov)
|
| 784 |
-
"""
|
| 785 |
import math as _math
|
| 786 |
from collections import Counter, defaultdict
|
| 787 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
@@ -792,12 +733,11 @@ def cluster_labels_pmi_bigram(
|
|
| 792 |
def is_junk_token(tok: str) -> bool:
|
| 793 |
if _is_junk_term(tok): return True
|
| 794 |
tl = tok.lower()
|
| 795 |
-
if tl.startswith("__"): return True
|
| 796 |
if "@" in tl: return True
|
| 797 |
if tl.isascii() and len(tl) <= 2: return True
|
| 798 |
if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
|
| 799 |
if len(tok) > 40: return True
|
| 800 |
-
# strip punctuation-heavy artifacts (URLs already replaced with 'URL')
|
| 801 |
if re.search(r"[^\w\-’']", tl): return True
|
| 802 |
return False
|
| 803 |
|
|
@@ -808,7 +748,6 @@ def cluster_labels_pmi_bigram(
|
|
| 808 |
def ngrams(toks, n):
|
| 809 |
return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
|
| 810 |
|
| 811 |
-
# Compute global doc frequency for tokens, bigrams, trigrams
|
| 812 |
glob_df_uni = Counter()
|
| 813 |
glob_df_bg = Counter()
|
| 814 |
glob_df_tri = Counter()
|
|
@@ -822,23 +761,19 @@ def cluster_labels_pmi_bigram(
|
|
| 822 |
|
| 823 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 824 |
|
| 825 |
-
# Pre-pass: DF stats
|
| 826 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 827 |
c = int(c)
|
| 828 |
toks = tokenize_clean(txt)
|
| 829 |
uni_set = set(toks)
|
| 830 |
bg_set = set(ngrams(toks, 2))
|
| 831 |
tri_set = set(ngrams(toks, 3))
|
| 832 |
-
# DF
|
| 833 |
glob_df_uni.update(uni_set)
|
| 834 |
glob_df_bg.update(bg_set)
|
| 835 |
glob_df_tri.update(tri_set)
|
| 836 |
-
# Per-cluster counts
|
| 837 |
per_c_bg[c].update(bg_set)
|
| 838 |
per_c_tri[c].update(tri_set)
|
| 839 |
per_c_texts[c].append(" ".join(toks))
|
| 840 |
per_c_doc_count[c] += 1
|
| 841 |
-
# Subject presence
|
| 842 |
if have_subjects:
|
| 843 |
stoks = tokenize_clean(subjects[idx] or "")
|
| 844 |
s_uni = set(stoks)
|
|
@@ -849,16 +784,13 @@ def cluster_labels_pmi_bigram(
|
|
| 849 |
per_c_subj_tri_docs[c].update(s_tri)
|
| 850 |
|
| 851 |
N = max(1, len(texts))
|
| 852 |
-
labels_out = {}
|
| 853 |
|
| 854 |
-
|
| 855 |
-
def too_ubiquitous(df_count): # fraction of docs
|
| 856 |
return (df_count / float(N)) > float(global_ubiq_cut)
|
| 857 |
|
|
|
|
| 858 |
for c in sorted(set(int(x) for x in labels)):
|
| 859 |
n_docs_c = max(1, per_c_doc_count[c])
|
| 860 |
-
|
| 861 |
-
# ===== PMI bigrams & trigrams with subject-coverage boost & ≥30% preference =====
|
| 862 |
phrases = []
|
| 863 |
for store, glob_df, subj_docs, n in (
|
| 864 |
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
|
@@ -868,7 +800,7 @@ def cluster_labels_pmi_bigram(
|
|
| 868 |
total_g = sum(glob_df.values()) + 1e-12
|
| 869 |
scored = []
|
| 870 |
for ng, cnt in store.most_common(3000):
|
| 871 |
-
if too_ubiquitous(glob_df[ng]):
|
| 872 |
continue
|
| 873 |
p_ng_c = cnt / total_c
|
| 874 |
p_ng_g = (glob_df[ng] / total_g)
|
|
@@ -877,17 +809,14 @@ def cluster_labels_pmi_bigram(
|
|
| 877 |
cov = 0.0
|
| 878 |
if have_subjects:
|
| 879 |
cov = subj_docs[ng] / n_docs_c
|
| 880 |
-
# add a preference bump if subject coverage ≥ threshold
|
| 881 |
if cov >= subject_min_cov:
|
| 882 |
-
score += 0.6
|
| 883 |
score += subject_alpha * cov
|
| 884 |
scored.append((score, cov, ng))
|
| 885 |
-
# prefer subject-coverage ≥ threshold first, then highest score
|
| 886 |
scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
|
| 887 |
take = max(1, topn // (3 if n == 3 else 2))
|
| 888 |
phrases.extend([p for _, _, p in scored[:take]])
|
| 889 |
|
| 890 |
-
# ===== Class-TFIDF unigrams with subject coverage boost & ≥30% preference =====
|
| 891 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 892 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 893 |
corpus = [docs_c[0], docs_bg[0]]
|
|
@@ -904,15 +833,13 @@ def cluster_labels_pmi_bigram(
|
|
| 904 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 905 |
if have_subjects:
|
| 906 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 907 |
-
if tok in vocab_index and not
|
| 908 |
i = vocab_index[tok]
|
| 909 |
frac = cnt_docs / n_docs_c
|
| 910 |
subj_cov[i] = frac
|
| 911 |
subj_cov_frac[i] = frac
|
| 912 |
|
| 913 |
-
# base + subject alpha
|
| 914 |
row_boosted = row + subject_alpha * subj_cov
|
| 915 |
-
# final score gets a preference bump if subj coverage ≥ threshold (but not a hard filter)
|
| 916 |
pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
|
| 917 |
final = row_boosted + pref_bump
|
| 918 |
|
|
@@ -920,8 +847,8 @@ def cluster_labels_pmi_bigram(
|
|
| 920 |
unis = []
|
| 921 |
for i in order:
|
| 922 |
tok = vocab[i]
|
| 923 |
-
if
|
| 924 |
-
if too_ubiquitous(glob_df_uni.get(tok, 0)):
|
| 925 |
continue
|
| 926 |
unis.append(tok)
|
| 927 |
if len(unis) >= max(0, topn - len(phrases)):
|
|
@@ -935,11 +862,9 @@ def cluster_labels_pmi_bigram(
|
|
| 935 |
# =================== Auto-k & merge ===================
|
| 936 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 937 |
n = X.shape[0]
|
| 938 |
-
# NEW: tiny-partition guard
|
| 939 |
if n <= 1:
|
| 940 |
return 1, {1: 0.0}
|
| 941 |
if n < min(ks):
|
| 942 |
-
# pick a small, reasonable k for tiny n
|
| 943 |
k_small = max(2, min(10, n))
|
| 944 |
return int(k_small), {int(k_small): 0.0}
|
| 945 |
|
|
@@ -952,7 +877,7 @@ def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
|
| 952 |
inertias = []
|
| 953 |
for k in ks:
|
| 954 |
k = int(k)
|
| 955 |
-
if n < k:
|
| 956 |
break
|
| 957 |
km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
|
| 958 |
km.fit(Xs)
|
|
@@ -981,7 +906,7 @@ def merge_close_clusters(labels, centers, thresh=0.92):
|
|
| 981 |
while parent[a]!=a: a=parent[a]
|
| 982 |
return a
|
| 983 |
for i in range(k):
|
| 984 |
-
for j in range(i+1, k):
|
| 985 |
if sim[i,j] >= thresh:
|
| 986 |
pi, pj = find(i), find(j)
|
| 987 |
if pi!=pj: parent[pj]=pi
|
|
@@ -1024,7 +949,7 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
|
|
| 1024 |
return seeds_red
|
| 1025 |
return None
|
| 1026 |
|
| 1027 |
-
# =================== NEW: cluster stabilizer
|
| 1028 |
def _centroids_from_labels(X, labels):
|
| 1029 |
labs = np.asarray(labels, dtype=int)
|
| 1030 |
uniq = np.unique(labs)
|
|
@@ -1038,7 +963,6 @@ def _centroids_from_labels(X, labels):
|
|
| 1038 |
if n > 0: v = v / n
|
| 1039 |
cents[int(c)] = v.astype(np.float32)
|
| 1040 |
return cents
|
| 1041 |
-
# CSR sparse
|
| 1042 |
X = X.tocsr()
|
| 1043 |
for c in uniq:
|
| 1044 |
rows = np.where(labs == c)[0]
|
|
@@ -1054,7 +978,7 @@ def _cosine_sim_to_centroids(vecs, centroids):
|
|
| 1054 |
if not centroids:
|
| 1055 |
return None, None
|
| 1056 |
keys = list(centroids.keys())
|
| 1057 |
-
C = np.stack([centroids[k] for k in keys], axis=0)
|
| 1058 |
if isinstance(vecs, np.ndarray):
|
| 1059 |
sims = vecs @ C.T
|
| 1060 |
else:
|
|
@@ -1067,8 +991,6 @@ def _cosine_sim_to_centroids(vecs, centroids):
|
|
| 1067 |
|
| 1068 |
def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
|
| 1069 |
labs = np.asarray(labels, dtype=int)
|
| 1070 |
-
|
| 1071 |
-
# 1) merge very close centroids
|
| 1072 |
cents = _centroids_from_labels(X_space, labs)
|
| 1073 |
keys = sorted([k for k in cents.keys() if k >= 0])
|
| 1074 |
if len(keys) >= 2:
|
|
@@ -1090,7 +1012,6 @@ def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_t
|
|
| 1090 |
labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
|
| 1091 |
cents = _centroids_from_labels(X_space, labs)
|
| 1092 |
|
| 1093 |
-
# 2) reassign tiny clusters to nearest big centroid (else noise -3)
|
| 1094 |
vc = pd.Series(labs).value_counts()
|
| 1095 |
big_labs = set(vc[vc >= int(min_size)].index.tolist())
|
| 1096 |
small_labs = set(vc[vc < int(min_size)].index.tolist())
|
|
@@ -1155,31 +1076,27 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
|
|
| 1155 |
df_in["context_anomaly_score"] = 0.0
|
| 1156 |
return df_in
|
| 1157 |
|
| 1158 |
-
# 1) IsolationForest percentile -> 0–6 (you already computed anomaly_score per partition; lower is “more anomalous” if using score_samples with sign reversed above)
|
| 1159 |
df = df_in.copy()
|
| 1160 |
if "anomaly_score" in df.columns:
|
| 1161 |
-
# higher = more anomalous in your current pipeline (you negated score_samples). Convert to percentile per bucket.
|
| 1162 |
df["_if_pct"] = 0.0
|
| 1163 |
for bkt, grp in df.groupby("bucket", dropna=False):
|
| 1164 |
vals = grp["anomaly_score"].astype(float)
|
| 1165 |
if vals.notna().sum() >= 5:
|
| 1166 |
-
ranks = vals.rank(pct=True, ascending=False)
|
| 1167 |
df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
|
| 1168 |
df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
|
| 1169 |
else:
|
| 1170 |
df["_if_pts"] = 0.0
|
| 1171 |
|
| 1172 |
-
# 2) Rule violations per bucket (0–2)
|
| 1173 |
df["_rule_pts"] = 0.0
|
| 1174 |
low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
|
| 1175 |
for bkt, terms in TAXONOMY.items():
|
| 1176 |
mask = (df["bucket"] == bkt)
|
| 1177 |
-
if not mask.any():
|
| 1178 |
continue
|
| 1179 |
if terms:
|
| 1180 |
has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
|
| 1181 |
df.loc[mask & (~has_term), "_rule_pts"] += 1.0
|
| 1182 |
-
# header expectation examples:
|
| 1183 |
if bkt == "Constituent":
|
| 1184 |
df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
|
| 1185 |
if bkt == "Scheduling":
|
|
@@ -1187,8 +1104,6 @@ def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
|
|
| 1187 |
df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0
|
| 1188 |
|
| 1189 |
df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
|
| 1190 |
-
|
| 1191 |
-
# 3) Corruption heuristics capped to 0–3
|
| 1192 |
df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
|
| 1193 |
|
| 1194 |
df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
|
|
@@ -1522,6 +1437,8 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 1522 |
|
| 1523 |
with gr.Row():
|
| 1524 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
|
|
|
| 1525 |
reset_btn = gr.Button("Reset filters")
|
| 1526 |
status = gr.Markdown("")
|
| 1527 |
|
|
@@ -1561,8 +1478,9 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 1561 |
state_dims = gr.State()
|
| 1562 |
state_extra_terms = gr.State()
|
| 1563 |
state_highlight = gr.State()
|
|
|
|
| 1564 |
|
| 1565 |
-
|
| 1566 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
| 1567 |
recs: List[Dict[str, Any]] = []
|
| 1568 |
if local_path.endswith(".jsonl"):
|
|
@@ -1945,7 +1863,13 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 1945 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1946 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 1947 |
|
| 1948 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1949 |
if not recs:
|
| 1950 |
return ("**No valid records found.**",
|
| 1951 |
None, None, None, None, None,
|
|
@@ -2234,6 +2158,38 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 2234 |
state_extra_terms, state_highlight,
|
| 2235 |
bucket_drop,
|
| 2236 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2237 |
)
|
| 2238 |
|
| 2239 |
# -------- Filtering & Search --------
|
|
@@ -2403,4 +2359,3 @@ with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Survei
|
|
| 2403 |
if __name__ == "__main__":
|
| 2404 |
# Disable SSR to avoid handler arity warnings under server-side rendering
|
| 2405 |
demo.launch(ssr_mode=False)
|
| 2406 |
-
|
|
|
|
| 72 |
"HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
|
| 73 |
"Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
|
| 74 |
"Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
|
| 75 |
+
"Legal": ["legal","lawsuit","intake","attorney","counsel","privileged","court","subpoena","confidential"],
|
| 76 |
"IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
|
| 77 |
"Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
|
| 78 |
"Other": [],
|
|
|
|
| 94 |
if bucket == "IT/Security":
|
| 95 |
return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
|
| 96 |
if bucket == "Constituent":
|
|
|
|
| 97 |
return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
|
| 98 |
if bucket == "Lobbyist":
|
| 99 |
return 5.0 if fd in LOBBY_DOMAINS else 0.0
|
| 100 |
if bucket == "Legal":
|
| 101 |
return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
|
| 102 |
if bucket == "Scheduling":
|
|
|
|
| 103 |
body = (row.get("body_text") or "")
|
| 104 |
return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
|
| 105 |
return 0.0
|
| 106 |
|
| 107 |
+
MIN_ROUTE_SCORE = 1.5
|
| 108 |
TIE_MARGIN = 1.0
|
| 109 |
|
| 110 |
def route_email_row(row: pd.Series) -> str:
|
| 111 |
text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 112 |
scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
|
|
|
|
| 113 |
for b, terms in TAXONOMY.items():
|
| 114 |
+
if not terms:
|
| 115 |
continue
|
|
|
|
| 116 |
hits = sum(1 for t in terms if t and t.lower() in text)
|
| 117 |
scores[b] += float(hits)
|
|
|
|
| 118 |
if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
|
| 119 |
scores[b] += 1.0
|
|
|
|
| 120 |
for b in TAXONOMY.keys():
|
| 121 |
scores[b] += _bucket_header_bonus(row, b)
|
|
|
|
| 122 |
best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
|
| 123 |
second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
|
| 124 |
if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
|
|
|
|
| 140 |
|
| 141 |
# ==== Expanded corruption lexicon ====
|
| 142 |
SUSPECT_PHRASES = [
|
|
|
|
| 143 |
"off the books","cover up","kickback","bribe","under the table",
|
| 144 |
"no inspection","special fee","friendly payment","confidential deal",
|
| 145 |
"nobody will find out","pay to play","cash only","shell company",
|
| 146 |
"bid rigging","embezzle","slush fund","false invoice","ghost employee",
|
| 147 |
"contract splitting","grease payment","unreported","unrecorded",
|
|
|
|
| 148 |
"off the record","just between us","don’t quote me on this","dont quote me on this",
|
| 149 |
"we never had this conversation","keep this between us","not ethical","illegal",
|
| 150 |
"grey area","gray area","write off","failed investment","they owe it to me",
|
|
|
|
| 151 |
"let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
|
| 152 |
"don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
|
| 153 |
"tell you on the phone","talk in person","come by my office","vpn",
|
|
|
|
| 154 |
"tax haven","off-shore account","offshore account","backdate","pull earnings forward",
|
| 155 |
"delete this email","no inspection","special fees","wire instructions",
|
| 156 |
]
|
|
|
|
| 157 |
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
|
| 158 |
|
|
|
|
| 159 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 160 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 161 |
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
|
| 162 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 163 |
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
|
| 164 |
|
|
|
|
| 165 |
OFFCHANNEL_PATTERNS = [
|
| 166 |
r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
|
| 167 |
r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
|
|
|
|
| 170 |
]
|
| 171 |
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
| 172 |
|
|
|
|
| 173 |
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 174 |
|
|
|
|
| 175 |
NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
|
| 176 |
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
|
| 177 |
s = (subject or "").lower()
|
|
|
|
| 182 |
if any(d in fd for d in NEWS_DOMAINS): return True
|
| 183 |
return False
|
| 184 |
|
|
|
|
| 185 |
NOTIFY_PATTERNS = [
|
| 186 |
r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
|
| 187 |
r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
|
| 188 |
r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
|
| 189 |
r"unable to determine", r"reset your password", r"\balert\b",
|
|
|
|
| 190 |
r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
|
| 191 |
r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
|
| 192 |
r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
|
|
|
|
| 203 |
return True
|
| 204 |
return False
|
| 205 |
|
|
|
|
| 206 |
# Unicode-script character classes (one match anywhere in a string is
# enough to flag the script's presence).
HEB_RE = re.compile(r'[\u0590-\u05FF]')  # Hebrew block
AR_RE = re.compile(r'[\u0600-\u06FF]')   # Arabic block
CYR_RE = re.compile(r'[\u0400-\u04FF]')  # Cyrillic block
|
|
|
|
| 217 |
return "en"
|
| 218 |
return "unknown"
|
| 219 |
|
|
|
|
| 220 |
CORR_LEX = {
|
| 221 |
"kickback" : ["kickback","bribe","under the table","gift","cash"],
|
| 222 |
"invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
|
|
|
|
| 238 |
"january","february","march","april","june","july","august","september",
|
| 239 |
"october","november","december"
|
| 240 |
}
|
|
|
|
|
|
|
| 241 |
# HTML/MIME boilerplate tokens that leak out of raw email markup; merged
# into the vectorizer stoplist (see STOPWORD_FOR_VEC) so they never become
# cluster keywords.
STOP_TERMS = {
    "div","span","nbsp","href","src","img","class","style","align","border","cid",
    "content","content-type","multipart","alternative","quoted","printable","utf",
    "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
    "type","id","service","person","generated","fyi",
}
|
|
|
|
|
|
|
| 247 |
AUX_STOP = {
|
| 248 |
"will","would","should","could","can","cant","cannot","did","do","does","done",
|
| 249 |
"have","has","had","having","get","got","make","made","let","need","want",
|
|
|
|
| 264 |
# Chinese forwarded-mail header tokens (发件人 = "From", 收件人 = "To",
# 主题 = "Subject", 发送时间 = "Sent", 转发 = "Forward") plus weekday markers,
# so quoted headers don't pollute extracted keywords.
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
# Extra Hebrew function words ("with", "or") not covered elsewhere.
HE_EXTRA_STOP = {"עם","או"}
|
| 266 |
|
|
|
|
| 267 |
# Fold all auxiliary stoplists into the master STOP_TERMS set.
# NOTE(review): CTA_STOP and TECH_META are defined elsewhere in this file.
STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
|
| 268 |
# Token-level junk filters applied when cleaning candidate vocabulary terms.
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)  # addresses / bare domains
YEAR_RE = re.compile(r"^(19|20)\d{2}$")             # four-digit years 1900-2099
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")      # pure numbers, dates, ratios
ONE_CHAR_RE = re.compile(r"^.$")                    # single characters
LONG_ALNUM_RE = re.compile(r"^[A-Za-z0-9_-]{24,}$")  # opaque ids/tokens (24+ chars)
HEXISH_RE = re.compile(r"^(?:[A-Fa-f0-9]{8,})$")     # hex-looking strings (8+ chars)
DIGIT_HEAVY_RE = re.compile(r"^(?:\D*\d){6,}\D*$")   # strings containing 6+ digits
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")  # snake_case-style identifiers
|
| 277 |
|
|
|
|
| 278 |
# Combined stopword list, sorted for deterministic ordering; presumably
# passed as the vectorizer's stop_words — confirm at the call site.
# NOTE(review): EN_STOP and HE_STOP are defined elsewhere in this file.
STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)
|
| 279 |
|
| 280 |
def _is_junk_term(t: str) -> bool:
|
|
|
|
| 339 |
cut = idx if (cut is None or idx < cut) else cut
|
| 340 |
if cut is not None:
|
| 341 |
text = text[:cut]
|
|
|
|
| 342 |
text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
|
| 343 |
text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
|
| 344 |
return text.strip()
|
|
|
|
| 411 |
if str(raw.get("type", "")).lower() == "meta":
|
| 412 |
return {}
|
| 413 |
|
|
|
|
| 414 |
attach_names = []
|
| 415 |
atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
|
| 416 |
if isinstance(atts, list):
|
|
|
|
| 461 |
|
| 462 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 463 |
|
|
|
|
| 464 |
if use_langdetect:
|
| 465 |
try:
|
| 466 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
|
|
|
| 532 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 533 |
return df
|
| 534 |
|
|
|
|
| 535 |
def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
    """Collect terms worth highlighting for one email row.

    Scans the subject + body for known suspect phrases, money/invoice/phone
    signals, and any caller-supplied extra terms, then de-duplicates
    case-insensitively (keeping first-seen casing) and caps the result.

    Parameters
    ----------
    row : pd.Series
        Email record; the "subject" and "body_text" fields are read.
    extra_terms : List[str]
        Additional user keywords; kept only if they actually occur in the text.

    Returns
    -------
    List[str]
        At most 24 unique highlight terms.
    """
    # Guard against missing/None fields (same `... or ""` pattern used
    # elsewhere in this file).
    subject = row.get("subject") or ""
    body = row.get("body_text") or ""
    txt = (subject + " " + body).lower()

    terms = [p for p in SUSPECT_PHRASES if p in txt]
    if MONEY_RE.search(txt):
        terms.append("$")
    if INVOICE_RE.search(txt):
        terms.append("invoice")
    # Phone numbers are searched in the body only (matches original behavior).
    if PHONE_RE.search(body):
        terms.append("phone")
    for t in extra_terms or []:
        t = t.strip()
        if t and t.lower() in txt:
            terms.append(t)

    # De-duplicate case-insensitively, preserving first-seen order and casing.
    out: List[str] = []
    seen = set()
    for t in terms:
        tl = t.lower()
        if tl not in seen:
            out.append(t)
            seen.add(tl)
    return out[:24]
|
| 553 |
|
| 554 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
|
|
|
|
| 566 |
hl_terms = []
|
| 567 |
if do_highlight:
|
| 568 |
hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
|
|
|
|
| 569 |
seen=set(); uniq=[]
|
| 570 |
for t in hl_terms:
|
| 571 |
tl=t.lower()
|
|
|
|
| 630 |
|
| 631 |
# ---------- Lightweight Embedding Utilities (Optional) ----------
|
| 632 |
def _load_embeddings(emb_path: str, binary: bool):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
|
| 634 |
return None, 0
|
| 635 |
try:
|
|
|
|
| 639 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
|
| 640 |
return kv, int(kv.vector_size)
|
| 641 |
except Exception:
|
|
|
|
| 642 |
try:
|
| 643 |
kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
|
| 644 |
return kv, int(kv.vector_size)
|
|
|
|
| 646 |
return None, 0
|
| 647 |
|
| 648 |
def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
vec = np.zeros((dim,), dtype=np.float32)
|
| 650 |
if not kv or not text:
|
| 651 |
return vec
|
|
|
|
| 657 |
cnt += 1
|
| 658 |
if cnt > 0:
|
| 659 |
vec /= float(cnt)
|
|
|
|
| 660 |
n = np.linalg.norm(vec)
|
| 661 |
if n > 0:
|
| 662 |
vec /= n
|
| 663 |
return vec
|
| 664 |
|
| 665 |
def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
| 666 |
if not kv or dim <= 0:
|
| 667 |
return np.zeros((len(texts), 0), dtype=np.float32)
|
| 668 |
out = np.zeros((len(texts), dim), dtype=np.float32)
|
|
|
|
| 721 |
topn=6,
|
| 722 |
subject_alpha=0.75,
|
| 723 |
global_ubiq_cut=0.20,
|
| 724 |
+
subject_min_cov=0.30
|
| 725 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
import math as _math
|
| 727 |
from collections import Counter, defaultdict
|
| 728 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
| 733 |
def is_junk_token(tok: str) -> bool:
|
| 734 |
if _is_junk_term(tok): return True
|
| 735 |
tl = tok.lower()
|
| 736 |
+
if tl.startswith("__"): return True
|
| 737 |
if "@" in tl: return True
|
| 738 |
if tl.isascii() and len(tl) <= 2: return True
|
| 739 |
if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
|
| 740 |
if len(tok) > 40: return True
|
|
|
|
| 741 |
if re.search(r"[^\w\-’']", tl): return True
|
| 742 |
return False
|
| 743 |
|
|
|
|
| 748 |
def ngrams(toks, n):
|
| 749 |
return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]
|
| 750 |
|
|
|
|
| 751 |
glob_df_uni = Counter()
|
| 752 |
glob_df_bg = Counter()
|
| 753 |
glob_df_tri = Counter()
|
|
|
|
| 761 |
|
| 762 |
have_subjects = subjects is not None and len(subjects) == len(texts)
|
| 763 |
|
|
|
|
| 764 |
for idx, (txt, c) in enumerate(zip(texts, labels)):
|
| 765 |
c = int(c)
|
| 766 |
toks = tokenize_clean(txt)
|
| 767 |
uni_set = set(toks)
|
| 768 |
bg_set = set(ngrams(toks, 2))
|
| 769 |
tri_set = set(ngrams(toks, 3))
|
|
|
|
| 770 |
glob_df_uni.update(uni_set)
|
| 771 |
glob_df_bg.update(bg_set)
|
| 772 |
glob_df_tri.update(tri_set)
|
|
|
|
| 773 |
per_c_bg[c].update(bg_set)
|
| 774 |
per_c_tri[c].update(tri_set)
|
| 775 |
per_c_texts[c].append(" ".join(toks))
|
| 776 |
per_c_doc_count[c] += 1
|
|
|
|
| 777 |
if have_subjects:
|
| 778 |
stoks = tokenize_clean(subjects[idx] or "")
|
| 779 |
s_uni = set(stoks)
|
|
|
|
| 784 |
per_c_subj_tri_docs[c].update(s_tri)
|
| 785 |
|
| 786 |
N = max(1, len(texts))
|
|
|
|
| 787 |
|
| 788 |
+
def too_ubiquitous(df_count):
|
|
|
|
| 789 |
return (df_count / float(N)) > float(global_ubiq_cut)
|
| 790 |
|
| 791 |
+
labels_out = {}
|
| 792 |
for c in sorted(set(int(x) for x in labels)):
|
| 793 |
n_docs_c = max(1, per_c_doc_count[c])
|
|
|
|
|
|
|
| 794 |
phrases = []
|
| 795 |
for store, glob_df, subj_docs, n in (
|
| 796 |
(per_c_bg[c], glob_df_bg, per_c_subj_bg_docs[c], 2),
|
|
|
|
| 800 |
total_g = sum(glob_df.values()) + 1e-12
|
| 801 |
scored = []
|
| 802 |
for ng, cnt in store.most_common(3000):
|
| 803 |
+
if too_ubiquitous(glob_df[ng]):
|
| 804 |
continue
|
| 805 |
p_ng_c = cnt / total_c
|
| 806 |
p_ng_g = (glob_df[ng] / total_g)
|
|
|
|
| 809 |
cov = 0.0
|
| 810 |
if have_subjects:
|
| 811 |
cov = subj_docs[ng] / n_docs_c
|
|
|
|
| 812 |
if cov >= subject_min_cov:
|
| 813 |
+
score += 0.6
|
| 814 |
score += subject_alpha * cov
|
| 815 |
scored.append((score, cov, ng))
|
|
|
|
| 816 |
scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
|
| 817 |
take = max(1, topn // (3 if n == 3 else 2))
|
| 818 |
phrases.extend([p for _, _, p in scored[:take]])
|
| 819 |
|
|
|
|
| 820 |
docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
|
| 821 |
docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
|
| 822 |
corpus = [docs_c[0], docs_bg[0]]
|
|
|
|
| 833 |
vocab_index = {t:i for i,t in enumerate(vocab)}
|
| 834 |
if have_subjects:
|
| 835 |
for tok, cnt_docs in per_c_subj_uni_docs[c].items():
|
| 836 |
+
if tok in vocab_index and not _is_junk_term(tok):
|
| 837 |
i = vocab_index[tok]
|
| 838 |
frac = cnt_docs / n_docs_c
|
| 839 |
subj_cov[i] = frac
|
| 840 |
subj_cov_frac[i] = frac
|
| 841 |
|
|
|
|
| 842 |
row_boosted = row + subject_alpha * subj_cov
|
|
|
|
| 843 |
pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
|
| 844 |
final = row_boosted + pref_bump
|
| 845 |
|
|
|
|
| 847 |
unis = []
|
| 848 |
for i in order:
|
| 849 |
tok = vocab[i]
|
| 850 |
+
if _is_junk_term(tok): continue
|
| 851 |
+
if too_ubiquitous(glob_df_uni.get(tok, 0)):
|
| 852 |
continue
|
| 853 |
unis.append(tok)
|
| 854 |
if len(unis) >= max(0, topn - len(phrases)):
|
|
|
|
| 862 |
# =================== Auto-k & merge ===================
|
| 863 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 864 |
n = X.shape[0]
|
|
|
|
| 865 |
if n <= 1:
|
| 866 |
return 1, {1: 0.0}
|
| 867 |
if n < min(ks):
|
|
|
|
| 868 |
k_small = max(2, min(10, n))
|
| 869 |
return int(k_small), {int(k_small): 0.0}
|
| 870 |
|
|
|
|
| 877 |
inertias = []
|
| 878 |
for k in ks:
|
| 879 |
k = int(k)
|
| 880 |
+
if n < k:
|
| 881 |
break
|
| 882 |
km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
|
| 883 |
km.fit(Xs)
|
|
|
|
| 906 |
while parent[a]!=a: a=parent[a]
|
| 907 |
return a
|
| 908 |
for i in range(k):
|
| 909 |
+
for j in range(i+1, k):
|
| 910 |
if sim[i,j] >= thresh:
|
| 911 |
pi, pj = find(i), find(j)
|
| 912 |
if pi!=pj: parent[pj]=pi
|
|
|
|
| 949 |
return seeds_red
|
| 950 |
return None
|
| 951 |
|
| 952 |
+
# =================== NEW: cluster stabilizer ===================
|
| 953 |
def _centroids_from_labels(X, labels):
|
| 954 |
labs = np.asarray(labels, dtype=int)
|
| 955 |
uniq = np.unique(labs)
|
|
|
|
| 963 |
if n > 0: v = v / n
|
| 964 |
cents[int(c)] = v.astype(np.float32)
|
| 965 |
return cents
|
|
|
|
| 966 |
X = X.tocsr()
|
| 967 |
for c in uniq:
|
| 968 |
rows = np.where(labs == c)[0]
|
|
|
|
| 978 |
if not centroids:
|
| 979 |
return None, None
|
| 980 |
keys = list(centroids.keys())
|
| 981 |
+
C = np.stack([centroids[k] for k in keys], axis=0)
|
| 982 |
if isinstance(vecs, np.ndarray):
|
| 983 |
sims = vecs @ C.T
|
| 984 |
else:
|
|
|
|
| 991 |
|
| 992 |
def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
|
| 993 |
labs = np.asarray(labels, dtype=int)
|
|
|
|
|
|
|
| 994 |
cents = _centroids_from_labels(X_space, labs)
|
| 995 |
keys = sorted([k for k in cents.keys() if k >= 0])
|
| 996 |
if len(keys) >= 2:
|
|
|
|
| 1012 |
labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
|
| 1013 |
cents = _centroids_from_labels(X_space, labs)
|
| 1014 |
|
|
|
|
| 1015 |
vc = pd.Series(labs).value_counts()
|
| 1016 |
big_labs = set(vc[vc >= int(min_size)].index.tolist())
|
| 1017 |
small_labs = set(vc[vc < int(min_size)].index.tolist())
|
|
|
|
| 1076 |
df_in["context_anomaly_score"] = 0.0
|
| 1077 |
return df_in
|
| 1078 |
|
|
|
|
| 1079 |
df = df_in.copy()
|
| 1080 |
if "anomaly_score" in df.columns:
|
|
|
|
| 1081 |
df["_if_pct"] = 0.0
|
| 1082 |
for bkt, grp in df.groupby("bucket", dropna=False):
|
| 1083 |
vals = grp["anomaly_score"].astype(float)
|
| 1084 |
if vals.notna().sum() >= 5:
|
| 1085 |
+
ranks = vals.rank(pct=True, ascending=False)
|
| 1086 |
df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
|
| 1087 |
df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
|
| 1088 |
else:
|
| 1089 |
df["_if_pts"] = 0.0
|
| 1090 |
|
|
|
|
| 1091 |
df["_rule_pts"] = 0.0
|
| 1092 |
low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
|
| 1093 |
for bkt, terms in TAXONOMY.items():
|
| 1094 |
mask = (df["bucket"] == bkt)
|
| 1095 |
+
if not mask.any():
|
| 1096 |
continue
|
| 1097 |
if terms:
|
| 1098 |
has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
|
| 1099 |
df.loc[mask & (~has_term), "_rule_pts"] += 1.0
|
|
|
|
| 1100 |
if bkt == "Constituent":
|
| 1101 |
df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
|
| 1102 |
if bkt == "Scheduling":
|
|
|
|
| 1104 |
df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0
|
| 1105 |
|
| 1106 |
df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
|
|
|
|
|
|
|
| 1107 |
df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)
|
| 1108 |
|
| 1109 |
df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
|
|
|
|
| 1437 |
|
| 1438 |
with gr.Row():
|
| 1439 |
run_btn = gr.Button("Process", variant="primary")
|
| 1440 |
+
# NEW: Update button lets you re-run with same uploaded file & current settings
|
| 1441 |
+
update_btn = gr.Button("Update", variant="secondary") # NEW: Update
|
| 1442 |
reset_btn = gr.Button("Reset filters")
|
| 1443 |
status = gr.Markdown("")
|
| 1444 |
|
|
|
|
| 1478 |
state_dims = gr.State()
|
| 1479 |
state_extra_terms = gr.State()
|
| 1480 |
state_highlight = gr.State()
|
| 1481 |
+
state_inbox = gr.State() # NEW: keep last uploaded file for Update
|
| 1482 |
|
| 1483 |
+
# -------- IO helpers --------
|
| 1484 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
| 1485 |
recs: List[Dict[str, Any]] = []
|
| 1486 |
if local_path.endswith(".jsonl"):
|
|
|
|
| 1863 |
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 1864 |
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 1865 |
|
| 1866 |
+
# Handle Gradio file object
|
| 1867 |
+
try:
|
| 1868 |
+
infile_path = inbox_file.name
|
| 1869 |
+
except Exception:
|
| 1870 |
+
infile_path = str(inbox_file) if inbox_file else ""
|
| 1871 |
+
|
| 1872 |
+
recs = _load_json_records(infile_path)
|
| 1873 |
if not recs:
|
| 1874 |
return ("**No valid records found.**",
|
| 1875 |
None, None, None, None, None,
|
|
|
|
| 2158 |
state_extra_terms, state_highlight,
|
| 2159 |
bucket_drop,
|
| 2160 |
],
|
| 2161 |
+
).then(
|
| 2162 |
+
# remember the uploaded file for future "Update" runs
|
| 2163 |
+
lambda f: f, inputs=[inbox_file], outputs=[state_inbox]
|
| 2164 |
+
)
|
| 2165 |
+
|
| 2166 |
+
# Keep state_inbox in sync whenever a new file is uploaded
|
| 2167 |
+
inbox_file.change(lambda f: f, inputs=[inbox_file], outputs=[state_inbox])
|
| 2168 |
+
|
| 2169 |
+
# NEW: Bind Update button — re-run with the last uploaded file + current settings
|
| 2170 |
+
update_btn.click(
|
| 2171 |
+
process_file,
|
| 2172 |
+
inputs=[
|
| 2173 |
+
state_inbox, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 2174 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 2175 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle,
|
| 2176 |
+
use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
|
| 2177 |
+
per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
|
| 2178 |
+
watchlist_in, min_mentions
|
| 2179 |
+
],
|
| 2180 |
+
outputs=[
|
| 2181 |
+
status, cluster_counts_df, domain_counts_df, sender_counts_df,
|
| 2182 |
+
actors_df, offhours_df,
|
| 2183 |
+
surv_entities_df, surv_samples_df,
|
| 2184 |
+
results_df,
|
| 2185 |
+
state_df, state_vec, state_X_reduced,
|
| 2186 |
+
state_index, state_term_names,
|
| 2187 |
+
state_use_lsa, state_use_faiss,
|
| 2188 |
+
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 2189 |
+
state_svd, state_norm, state_dims,
|
| 2190 |
+
state_extra_terms, state_highlight,
|
| 2191 |
+
bucket_drop,
|
| 2192 |
+
],
|
| 2193 |
)
|
| 2194 |
|
| 2195 |
# -------- Filtering & Search --------
|
|
|
|
| 2359 |
if __name__ == "__main__":
    # Disable SSR to avoid handler arity warnings under server-side rendering
    demo.launch(ssr_mode=False)
|
|
|