Spaces:

wuhp
/

testmail2

Sleeping

File size: 101,010 Bytes

ed161c2
8d8f116
ed161c2
8d8f116
 
 
ed161c2
8d8f116
 
 
 
 
 
 
 
 
 
 
ed161c2
d5cce46
9ef7e16
 
ed161c2
 
d5cce46
 
 
609d65e
6c85b5a
 
609d65e
 
 
 
 
 
 
 
 
 
 
 
 
d4eefe7
 
 
 
 
 
 
d5cce46
ed161c2
 
 
c8241c4
ed161c2
 
 
 
d4eefe7
ed161c2
 
 
 
 
8d8f116
186377a
 
 
 
 
 
 
 
6af16b8
186377a
 
 
 
 
 
 
 
08fef30
186377a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6af16b8
186377a
 
 
 
08fef30
186377a
6af16b8
186377a
 
 
 
 
 
 
 
 
 
 
 
 
b78d58c
 
 
 
 
 
 
 
7a250cb
 
c8241c4
b78d58c
 
d4eefe7
ed161c2
d4eefe7
 
 
 
 
 
 
 
 
 
 
 
 
ed161c2
d4eefe7
ed161c2
d5cce46
 
d4eefe7
d5cce46
d4eefe7
 
 
 
 
 
 
 
 
 
 
d5cce46
f97dfc3
 
 
 
 
 
 
 
 
 
b76588a
 
e12524d
b76588a
c720489
 
 
 
 
b76588a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cce46
 
 
 
 
 
 
f97dfc3
7a250cb
 
 
 
 
d4eefe7
7a250cb
d4eefe7
7a250cb
 
 
 
 
b76588a
 
 
 
c720489
b76588a
c720489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a250cb
 
 
 
 
6af16b8
 
 
abfac2f
 
e12524d
c720489
7a250cb
abfac2f
 
 
d4eefe7
 
 
 
abfac2f
 
 
 
 
7a250cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4eefe7
8d8f116
 
 
 
 
 
 
 
 
 
 
b78d58c
 
 
 
c8241c4
b78d58c
 
 
 
 
 
 
 
 
c720489
 
b78d58c
8d8f116
 
 
 
 
 
 
 
 
d4eefe7
 
 
 
 
 
 
 
 
b78d58c
 
 
d4eefe7
b78d58c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d8f116
ed161c2
 
b78d58c
 
c8241c4
d4eefe7
 
 
 
 
 
 
 
 
 
 
b78d58c
7a250cb
b78d58c
 
 
c8241c4
b78d58c
 
d4eefe7
b78d58c
c8241c4
b78d58c
 
 
 
 
d4eefe7
 
c8241c4
b78d58c
 
 
 
c8241c4
b78d58c
 
 
 
 
 
 
 
 
 
 
d4eefe7
c8241c4
b78d58c
c8241c4
ed161c2
b4b271e
 
 
b76588a
b4b271e
b76588a
c8241c4
8d8f116
 
 
 
 
b78d58c
8d8f116
b78d58c
c8241c4
b78d58c
8d8f116
 
c8241c4
b78d58c
 
 
c8241c4
8d8f116
 
 
 
 
 
 
d4eefe7
8d8f116
 
 
d4eefe7
8d8f116
 
 
ed161c2
 
 
 
 
 
 
8336b56
ed161c2
 
8336b56
ed161c2
8336b56
 
d4eefe7
 
8336b56
ed161c2
 
 
 
 
 
 
 
 
 
7a250cb
ed161c2
 
 
d4eefe7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6af16b8
d4eefe7
 
 
 
 
 
ed161c2
7a250cb
ed161c2
7a250cb
 
d4eefe7
ed161c2
 
d4eefe7
 
 
 
 
 
 
 
 
 
ed161c2
d4eefe7
ed161c2
 
d4eefe7
ed161c2
 
 
 
 
 
 
 
 
 
7a250cb
c8241c4
ed161c2
 
2ebeb60
 
d4eefe7
 
 
ed161c2
d4eefe7
ed161c2
d4eefe7
 
 
 
 
ed161c2
 
 
2ebeb60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed161c2
 
609d65e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cce46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4eefe7
f97dfc3
 
d5cce46
 
abfac2f
 
 
 
 
 
 
 
6af16b8
abfac2f
d5cce46
f97dfc3
 
 
609d65e
 
f97dfc3
 
 
 
6af16b8
b76588a
c720489
abfac2f
 
 
f97dfc3
 
 
609d65e
f97dfc3
 
609d65e
 
f97dfc3
609d65e
 
 
f97dfc3
609d65e
f97dfc3
c720489
 
 
609d65e
c720489
 
f97dfc3
c720489
 
f97dfc3
609d65e
 
 
 
 
 
 
 
c720489
 
 
609d65e
 
 
 
 
 
 
 
 
 
6af16b8
609d65e
f97dfc3
6af16b8
f97dfc3
c720489
609d65e
 
 
 
 
 
 
 
 
6af16b8
609d65e
 
 
 
 
 
 
 
abfac2f
6af16b8
609d65e
abfac2f
 
609d65e
abfac2f
609d65e
f97dfc3
609d65e
f97dfc3
 
 
 
 
 
 
 
c720489
 
abfac2f
 
c720489
 
6af16b8
abfac2f
 
 
 
609d65e
 
abfac2f
 
 
 
609d65e
 
b76588a
6af16b8
 
609d65e
 
 
b76588a
f97dfc3
609d65e
f97dfc3
 
d5cce46
 
d4eefe7
d5cce46
 
6c85b5a
 
 
 
 
 
d5cce46
 
 
 
 
 
 
 
6c85b5a
6af16b8
6c85b5a
d5cce46
 
 
6c85b5a
 
 
 
d5cce46
 
 
 
 
 
6c85b5a
ed161c2
 
 
 
d5cce46
 
 
 
 
 
 
 
 
6af16b8
d5cce46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4eefe7
 
d5cce46
 
 
 
 
6af16b8
abfac2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6af16b8
abfac2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4eefe7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cce46
 
 
 
 
 
d4eefe7
 
d5cce46
 
 
 
 
 
 
 
 
d4eefe7
 
 
 
 
 
d5cce46
 
186377a
 
 
 
 
 
 
 
 
 
 
6af16b8
186377a
 
 
 
 
 
 
 
 
6af16b8
186377a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08fef30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed161c2
7a250cb
 
ed161c2
7a250cb
 
ed161c2
 
7a250cb
ed161c2
7a250cb
 
 
ed161c2
7a250cb
d4eefe7
ed161c2
 
08fef30
 
d4eefe7
08fef30
 
 
 
8d8f116
 
 
 
ed161c2
 
d5cce46
6c85b5a
ed161c2
 
 
609d65e
6c85b5a
 
ed161c2
 
6c85b5a
d5cce46
08fef30
ed161c2
609d65e
6c85b5a
 
 
609d65e
6c85b5a
ed161c2
d5cce46
d4eefe7
ed161c2
d4eefe7
ed161c2
d4eefe7
 
 
609d65e
6c85b5a
 
609d65e
6c85b5a
 
ed161c2
186377a
d4eefe7
 
 
 
 
08fef30
ed161c2
 
7a250cb
186377a
d4eefe7
08fef30
 
 
 
 
 
 
b4b271e
9ef7e16
ed161c2
6af16b8
 
ed161c2
 
9ef7e16
ed161c2
d4eefe7
7a250cb
609d65e
6c85b5a
8d8f116
d4eefe7
 
 
 
08fef30
 
 
 
 
ed161c2
8d8f116
ed161c2
8d8f116
8336b56
ed161c2
 
08fef30
d4eefe7
 
 
 
 
 
7a250cb
 
 
 
d4eefe7
 
 
6af16b8
8d8f116
6af16b8
b4b271e
 
8d8f116
 
 
b4b271e
 
 
8d8f116
b78d58c
b4b271e
8d8f116
b78d58c
 
 
8d8f116
 
 
 
b78d58c
 
 
 
8d8f116
b78d58c
 
b4b271e
 
c8241c4
 
186377a
c8241c4
 
d4eefe7
 
c8241c4
 
 
 
186377a
c8241c4
ed161c2
186377a
 
ed161c2
186377a
ed161c2
 
 
 
 
d4eefe7
 
 
 
ed161c2
 
 
d4eefe7
 
ed161c2
 
 
 
 
 
 
 
 
 
 
 
abfac2f
 
9ef7e16
 
08fef30
d4eefe7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08fef30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ef7e16
ed161c2
d4eefe7
609d65e
 
08fef30
 
b4b271e
08fef30
 
 
 
 
 
 
 
 
 
 
 
 
b4b271e
08fef30
609d65e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30c3c12
609d65e
30c3c12
 
609d65e
 
6c85b5a
 
 
609d65e
6c85b5a
609d65e
 
 
 
6c85b5a
 
609d65e
 
 
 
6c85b5a
609d65e
 
 
 
 
6c85b5a
609d65e
 
 
 
 
 
 
 
6c85b5a
 
609d65e
 
 
 
 
 
 
 
 
 
6c85b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609d65e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08fef30
609d65e
 
 
 
 
 
 
 
 
 
 
 
 
6c85b5a
 
08fef30
 
 
6c85b5a
 
 
 
08fef30
 
6c85b5a
3e01db4
 
 
 
 
 
 
6c85b5a
 
08fef30
6c85b5a
 
609d65e
 
 
3e01db4
 
 
 
 
609d65e
 
 
08fef30
609d65e
 
 
08fef30
609d65e
 
3e01db4
609d65e
 
 
 
08fef30
609d65e
fe3c81f
 
609d65e
 
 
 
 
 
 
 
 
 
3e01db4
 
 
 
609d65e
 
 
 
 
 
08fef30
609d65e
 
 
08fef30
d4eefe7
 
 
 
6af16b8
 
 
 
 
 
 
b4b271e
3e01db4
08fef30
 
 
 
 
 
 
 
 
b4b271e
b78d58c
 
609d65e
b78d58c
 
8d8f116
b78d58c
3e01db4
08fef30
 
 
 
 
 
 
 
 
b78d58c
b4b271e
ed161c2
 
 
08fef30
186377a
 
 
 
 
 
08fef30
d4eefe7
 
 
 
3e01db4
 
d4eefe7
3e01db4
 
d4eefe7
 
 
 
08fef30
186377a
 
 
f97dfc3
08fef30
3e01db4
 
609d65e
 
 
08fef30
609d65e
186377a
 
 
 
9ef7e16
186377a
 
609d65e
6c85b5a
08fef30
6c85b5a
609d65e
 
08fef30
6c85b5a
609d65e
186377a
3e01db4
 
609d65e
 
6c85b5a
3e01db4
 
 
 
 
 
 
d5cce46
3e01db4
 
609d65e
 
 
 
6c85b5a
3e01db4
 
 
609d65e
 
 
 
6c85b5a
609d65e
3e01db4
 
609d65e
08fef30
3e01db4
08fef30
3e01db4
 
 
 
 
 
 
 
 
 
 
 
 
609d65e
d5cce46
609d65e
abfac2f
 
609d65e
d5cce46
186377a
609d65e
186377a
609d65e
186377a
f97dfc3
6c85b5a
609d65e
08fef30
6c85b5a
186377a
 
 
609d65e
3e01db4
 
 
 
08fef30
3e01db4
 
 
 
 
 
 
 
f97dfc3
08fef30
6c85b5a
08fef30
 
 
 
 
d4eefe7
186377a
9ef7e16
08fef30
 
 
 
 
 
ed161c2
6c85b5a
 
609d65e
 
 
ed161c2
609d65e
 
6c85b5a
609d65e
 
6c85b5a
3e01db4
 
b76588a
08fef30
3e01db4
186377a
 
 
 
 
3e01db4
 
186377a
3e01db4
ed161c2
186377a
 
3e01db4
 
 
ed161c2
3e01db4
 
 
d4eefe7
3e01db4
d4eefe7
 
6c85b5a
3e01db4
 
 
 
 
186377a
3e01db4
 
 
 
 
 
 
 
 
abfac2f
3e01db4
6c85b5a
 
c8241c4
 
08fef30
 
 
 
 
 
 
 
3e01db4
186377a
 
3e01db4
186377a
 
 
c8241c4
8d8f116
08fef30
ed161c2
8d8f116
609d65e
186377a
 
 
 
 
08fef30
3e01db4
 
186377a
08fef30
 
 
 
 
186377a
 
08fef30
 
186377a
609d65e
6af16b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d8f116
 
ed161c2
186377a
3e01db4
 
abfac2f
186377a
abfac2f
d4eefe7
ed161c2
08fef30
186377a
abfac2f
3e01db4
 
 
186377a
 
3e01db4
 
 
ed161c2
186377a
ed161c2
186377a
3e01db4
186377a
abfac2f
3e01db4
 
 
186377a
 
3e01db4
 
 
ed161c2
08fef30
6c85b5a
3e01db4
 
 
abfac2f
609d65e
6c85b5a
3e01db4
 
 
ed161c2
6c85b5a
3e01db4
abfac2f
3e01db4
 
6c85b5a
 
3e01db4
 
6c85b5a
 
3e01db4
 
6c85b5a
 
 
d4eefe7
abfac2f
6c85b5a
 
 
3e01db4
 
 
6c85b5a
 
3e01db4
 
 
186377a
 
3e01db4
 
 
ed161c2
08fef30
6c85b5a
3e01db4
 
6c85b5a
ed161c2
3e01db4
abfac2f
3e01db4
 
 
 
 
 
 
 
 
6c85b5a
ed161c2
609d65e
 
3e01db4
abfac2f
 
186377a
abfac2f
 
3e01db4
ed161c2
3e01db4
 
d4eefe7
3e01db4
 
30c3c12
08fef30
6c85b5a
3e01db4
 
6c85b5a
 
186377a
 
 
 
 
 
609d65e
3e01db4
186377a
3e01db4
 
 
186377a
 
3e01db4
 
 
abfac2f
3e01db4
 
 
 
 
186377a
 
3e01db4
 
 
abfac2f
3e01db4
 
 
 
 
186377a
 
3e01db4
 
 
609d65e
8d8f116
3e01db4
fe3c81f

import os, re, json, io, math, gc, uuid
from datetime import datetime
from typing import List, Dict, Any, Tuple, Optional

import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

from bs4 import BeautifulSoup
import ftfy
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

import gradio as gr
from tqdm import tqdm

# sklearn (CPU-friendly)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer as CharTfidf
from sklearn.cluster import MiniBatchKMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.metrics.pairwise import cosine_similarity

# === NEW / UPDATED IMPORTS ===
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from scipy.sparse import csr_matrix
try:
    import hdbscan  # OPTIONAL (pip install hdbscan)
    HDBSCAN_OK = True
except Exception:
    HDBSCAN_OK = False

# Optional tiny/fast word vectors via Gensim (local .txt/.vec/.bin)
try:
    from gensim.models import KeyedVectors  # OPTIONAL
    GENSIM_OK = True
except Exception:
    GENSIM_OK = False

# Optional light anomaly detection
try:
    from sklearn.ensemble import IsolationForest
    ISO_OK = True
except Exception:
    ISO_OK = False

from scipy.sparse import hstack

# Optional fast ANN (CPU)
try:
    import faiss  # faiss-cpu on HF Space
    FAISS_OK = True
except Exception:
    FAISS_OK = False

# Optional tiny sentiment
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    VADER_OK = True
except Exception:
    VADER_OK = False

# ======== STAGE-1 TAXONOMY (Buckets) ========
TAXONOMY = {
    "Lobbyist": ["lobby","lobbyist","pac","influence"],
    "Campaign Finance": ["donation","contribution","fundraiser","pledge","campaign finance","pac"],
    "Procurement": ["contract","tender","rfp","rfq","bid","invoice","vendor","purchase order","po"],
    "HR/Admin": ["hiring","personnel","payroll","benefits","policy","vacation","pto"],
    "Constituent": ["constituent","concerned citizen","my issue","complaint","community"],
    "Scheduling": ["schedule","meeting","appointment","calendar","invite","availability","reschedule"],
    "Legal": ["legal","lawsuit","intake","attorney","counsel","privileged","court","subpoena","confidential"],
    "IT/Security": ["password","account security","two-factor","2fa","vpn","verification code","security alert","it support"],
    "Newsletters/Alerts": ["newsletter","daily briefing","news update","unsubscribe","press clip","digest"],
    "Other": [],
}
# header/domain cues (expand as you learn)
LOBBY_DOMAINS = set()   # e.g., {"acme-lobby.com"}
LEGAL_DOMAINS = set()   # e.g., {"biglaw.com","firmlaw.com"}

def _contains_any(text: str, terms: list) -> bool:
    if not text or not terms: return False
    tl = text.lower()
    return any(t for t in terms if t and t.lower() in tl)

def _bucket_header_bonus(row: pd.Series, bucket: str) -> float:
    fd = (row.get("from_domain") or "").lower()
    subj = (row.get("subject") or "")
    if bucket == "Newsletters/Alerts":
        return 5.0 if is_news_like(subj, row.get("body_text",""), fd) else 0.0
    if bucket == "IT/Security":
        return 5.0 if is_notification_like(subj, row.get("body_text",""), row.get("from_email",""), fd) else 0.0
    if bucket == "Constituent":
        return 3.0 if (fd in PERSONAL_DOMAINS) else 0.0
    if bucket == "Lobbyist":
        return 5.0 if fd in LOBBY_DOMAINS else 0.0
    if bucket == "Legal":
        return 5.0 if (("law" in fd) or (fd in LEGAL_DOMAINS) or ("privileged" in subj.lower())) else 0.0
    if bucket == "Scheduling":
        body = (row.get("body_text") or "")
        return 3.0 if (ATTACH_NAME_RE.search(" ".join(row.get("attachments") or [])) or re.search(r"\binvitation\b|\binvite\b", subj, re.I) or re.search(r"\.ics\b", body, re.I)) else 0.0
    return 0.0

MIN_ROUTE_SCORE = 1.5
TIE_MARGIN      = 1.0

def route_email_row(row: pd.Series) -> str:
    text = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
    scores: dict = {b: 0.0 for b in TAXONOMY.keys()}
    for b, terms in TAXONOMY.items():
        if not terms:
            continue
        hits = sum(1 for t in terms if t and t.lower() in text)
        scores[b] += float(hits)
        if b in ("Lobbyist","Procurement") and any(p in text for p in SUSPECT_PHRASES):
            scores[b] += 1.0
    for b in TAXONOMY.keys():
        scores[b] += _bucket_header_bonus(row, b)
    best_bucket, best = max(scores.items(), key=lambda kv: kv[1])
    second = sorted(scores.values(), reverse=True)[1] if len(scores) > 1 else 0.0
    if best < MIN_ROUTE_SCORE or (best - second) < TIE_MARGIN:
        return "Other"
    return best_bucket

# =================== Regex & Flags ===================
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
SIG_RE = re.compile(r"\n-- ?\n", re.M)
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
FWD_MSG_RE   = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
ON_WROTE_RE  = re.compile(r'^\s*On .* wrote:$', re.M)

SKIP_LANGDETECT = True

# ==== Expanded corruption lexicon ====
SUSPECT_PHRASES = [
    "off the books","cover up","kickback","bribe","under the table",
    "no inspection","special fee","friendly payment","confidential deal",
    "nobody will find out","pay to play","cash only","shell company",
    "bid rigging","embezzle","slush fund","false invoice","ghost employee",
    "contract splitting","grease payment","unreported","unrecorded",
    "off the record","just between us","don’t quote me on this","dont quote me on this",
    "we never had this conversation","keep this between us","not ethical","illegal",
    "grey area","gray area","write off","failed investment","they owe it to me",
    "let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
    "don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
    "tell you on the phone","talk in person","come by my office","vpn",
    "tax haven","off-shore account","offshore account","backdate","pull earnings forward",
    "delete this email","no inspection","special fees","wire instructions",
]
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)

MONEY_RE   = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
PHONE_RE   = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)

OFFCHANNEL_PATTERNS = [
    r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
    r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
    r"take this offline", r"don.?t (text|email) (me|this)",
    r"\bOTR\b", r"\bTOL\b", r"\bTYOP\b", r"\bLDL\b"
]
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)

PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}

NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
def is_news_like(subject: str, body: str, from_domain: str) -> bool:
    s = (subject or "").lower()
    b = (body or "").lower()
    fd = (from_domain or "").lower()
    if "unsubscribe" in b or "manage preferences" in b: return True
    if any(k in s for k in ["daily briefing","morning update","newsletter","top stories"]): return True
    if any(d in fd for d in NEWS_DOMAINS): return True
    return False

NOTIFY_PATTERNS = [
    r"\bno[-\s]?reply\b", r"do not reply", r"security alert", r"new sign[-\s]?in",
    r"verification code", r"two[-\s]?factor", r"\b2fa\b", r"\botp\b", r"\bcode[:\s]",
    r"itunes connect", r"apple id", r"your google account", r"used (?:a )?new browser",
    r"unable to determine", r"reset your password", r"\balert\b",
    r"mailer[-\s]?daemon", r"\bpostmaster\b", r"delivery status notification",
    r"undeliverable", r"delivery failure", r"returned mail", r"mail delivery subsystem",
    r"proofpoint", r"mimecast", r"dmarc", r"\bspf\b", r"\bdkim\b", r"quarantine",
    r"spam digest", r"phishing", r"security gateway", r"mail[-\s]?secure|secure message"
]
NOTIFY_RE = re.compile("|".join(NOTIFY_PATTERNS), re.I)
def is_notification_like(subject: str, body: str, from_email: str, from_domain: str) -> bool:
    s = (subject or "").lower()
    b = (body or "").lower()
    fe = (from_email or "").lower()
    if NOTIFY_RE.search(s) or NOTIFY_RE.search(b):
        return True
    if re.search(r"noreply|no-reply|donotreply", fe):
        return True
    return False

HEB_RE = re.compile(r'[\u0590-\u05FF]')
AR_RE  = re.compile(r'[\u0600-\u06FF]')
CYR_RE = re.compile(r'[\u0400-\u04FF]')
def fast_lang_heuristic(text: str) -> str:
    t = text or ""
    if HEB_RE.search(t): return "he"
    if AR_RE.search(t):  return "ar"
    if CYR_RE.search(t): return "ru"
    letters = sum(ch.isalpha() for ch in t)
    ascii_letters = sum(ch.isascii() and ch.isalpha() for ch in t)
    if letters and ascii_letters / max(1, letters) > 0.85:
        return "en"
    return "unknown"

CORR_LEX = {
    "kickback"      : ["kickback","bribe","under the table","gift","cash"],
    "invoice_fraud" : ["false invoice","ghost employee","contract splitting","slush fund","shell company","front company"],
    "procurement"   : ["bid rigging","tender","vendor","sole source","rfp","rfq","purchase order","po"],
    "money_flow"    : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
}

# =================== Label cleanup helpers ===================
EN_STOP = {
    "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
    "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
    "re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
    "message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
    "herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard","ny"
}
HE_STOP = {"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה","שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"}
MONTHS = {
    "jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
    "january","february","march","april","june","july","august","september",
    "october","november","december"
}
STOP_TERMS = {
    "div","span","nbsp","href","src","img","class","style","align","border","cid",
    "content","content-type","multipart","alternative","quoted","printable","utf",
    "windows-1255","iso-8859","us-ascii","html","plain","attachment","filename",
    "type","id","service","person","generated","fyi"
}
AUX_STOP = {
    "will","would","should","could","can","cant","cannot","did","do","does","done",
    "have","has","had","having","get","got","make","made","let","need","want",
    "not","dont","didnt","isnt","arent","wasnt","werent","im","youre","hes","shes",
    "weve","ive","theyre","its","ok","okay","pls","please","thx","thanks","regards","best",
    "hi","hello","dear","re","fw","fwd","via","kind"
}
CTA_STOP = {
    "click","here","unsubscribe","view","browser","mailto","reply","iphone","android",
    "press","link","below","above","update","newsletter","manage","preferences",
    "לחץ","כאן","נשלח","מה","מה-iphone","הטלפון"
}
TECH_META = {
    "quot","nbsp","cid","href","src","img","class","style","div","span","http","https",
    "content","content-type","multipart","alternative","quoted","printable","utf",
    "windows-1255","iso-8859","us-ascii","attachment","filename"
}
ZH_HEADER_STOP = {"发送时间","星期","星期一","星期二","星期三","星期四","星期五","星期六","星期日","转发","主题","收件人","发件人"}
HE_EXTRA_STOP = {"עם","או"}

STOP_TERMS |= AUX_STOP | CTA_STOP | TECH_META | ZH_HEADER_STOP | HE_EXTRA_STOP
EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
ONE_CHAR_RE = re.compile(r"^.$")

LONG_ALNUM_RE   = re.compile(r"^[A-Za-z0-9_-]{24,}$")
HEXISH_RE       = re.compile(r"^(?:[A-Fa-f0-9]{8,})$")
DIGIT_HEAVY_RE  = re.compile(r"^(?:\D*\d){6,}\D*$")
UNDERSCORE_HEAVY_RE = re.compile(r"^[A-Za-z0-9]*_[A-Za-z0-9_]*$")

STOPWORD_FOR_VEC = sorted(EN_STOP | HE_STOP | STOP_TERMS)

def _is_junk_term(t: str) -> bool:
    tl = (t or "").strip().lower()
    if not tl: return True
    if tl in STOP_TERMS or tl in EN_STOP or tl in HE_STOP or tl in MONTHS: return True
    if EMAIL_LIKE_RE.search(tl): return True
    if YEAR_RE.match(tl): return True
    if NUMERIC_RE.match(tl): return True
    if ONE_CHAR_RE.match(tl): return True
    if LONG_ALNUM_RE.match(t): return True
    if HEXISH_RE.match(t): return True
    if DIGIT_HEAVY_RE.match(t): return True
    if UNDERSCORE_HEAVY_RE.match(t): return True
    if len(t) > 40: return True
    return False

def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
    ordered = idxs[np.argsort(-mean_vec[idxs])]
    cleaned = []
    for i in ordered:
        term = names[i]
        if _is_junk_term(term):
            continue
        cleaned.append(term)
        if len(cleaned) >= want:
            break
    if len(cleaned) < max(2, want//2):
        for i in ordered:
            term = names[i]
            if EMAIL_LIKE_RE.search(term) or YEAR_RE.match(term.lower()):
                continue
            if term not in cleaned:
                cleaned.append(term)
            if len(cleaned) >= want:
                break
    return cleaned

# =================== HTML/Text & Header Parsing ===================
def html_to_text(html: str) -> str:
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n")

def strip_quotes_and_sigs(text: str) -> str:
    if not text:
        return ""
    text = QUOTE_LINE_RE.sub("", text)
    parts = SIG_RE.split(text)
    if parts:
        text = parts[0]
    text = SENT_FROM_RE.sub("", text)
    text = HEBREW_SENT_FROM_RE.sub("", text)
    cut = None
    for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
        m = pat.search(text)
        if m:
            idx = m.start()
            cut = idx if (cut is None or idx < cut) else cut
    if cut is not None:
        text = text[:cut]
    text = re.sub(r"\n\s*sent from my .*?$", "", text, flags=re.I|re.M)
    text = re.sub(r"\n\s*(נשלח מה-?iphone).*?$", "", text, flags=re.I|re.M)
    return text.strip()

def parse_name_email(s: str) -> Tuple[str, str]:
    if not s:
        return "", ""
    m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
    if m:
        return (m.group(1) or "").strip(), (m.group(2) or "").strip()
    return "", s.strip()

def parse_multi_emails(s: str) -> List[str]:
    if not s: return []
    parts = re.split(r",\s*(?=[^,]*@)", s)
    emails = []
    for p in parts:
        _, e = parse_name_email(p.strip())
        if e: emails.append(e)
    return emails

def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
    headers: Dict[str, str] = {}
    lines = (text or "").splitlines()
    header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject|Subject:|To:|Cc:|Bcc:|From:):')
    i = 0
    saw_header = False
    while i < len(lines):
        line = lines[i].rstrip("\r")
        stripped = line.strip()
        if stripped == "":
            i += 1
            break
        if header_pat.match(line):
            saw_header = True
            key, rest = line.split(":", 1)
            key = key.strip()
            value = rest.strip()
            if value == "":
                j = i + 1
                cont = []
                while j < len(lines):
                    nxt = lines[j].rstrip("\r")
                    nxts = nxt.strip()
                    if nxts == "" or header_pat.match(nxt):
                        break
                    if key.lower() == "subject":
                        if FWD_BEGIN_RE.match(nxts) or FWD_MSG_RE.match(nxts) or ON_WROTE_RE.match(nxts):
                            break
                        if len(cont) > 0:
                            break
                    cont.append(nxts)
                    j += 1
                value = " ".join(cont)
                i = j
            else:
                i += 1
            headers[key] = value
            continue
        else:
            if saw_header:
                break
            else:
                break
    body_text = "\n".join(lines[i:]) if i < len(lines) else ""
    return headers, body_text

# =================== Normalization & Utilities ===================
def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
    if str(raw.get("type", "")).lower() == "meta":
        return {}

    attach_names = []
    atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
    if isinstance(atts, list):
        for a in atts:
            if isinstance(a, dict):
                name = a.get("filename") or a.get("name") or ""
            else:
                name = str(a)
            if name:
                attach_names.append(str(name))

    body_text_raw = raw.get("body_text") or raw.get("text") or ""
    html_content  = raw.get("body_html") or raw.get("html") or ""
    if html_content and not body_text_raw:
        body_text_raw = html_to_text(html_content)
    body_text_raw = ftfy.fix_text(body_text_raw or "")

    subject_text = ""
    from_name = from_email = from_domain = ""
    to_emails: List[str] = []
    date_val = raw.get("date") or raw.get("Date") or ""

    if body_text_raw:
        headers, body_only = parse_email_headers(body_text_raw)
        subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
        sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
        date_val = headers.get("Date", "") or date_val
        to_emails = parse_multi_emails(headers.get("To","") or (raw.get("to") or "")) + \
                    parse_multi_emails(headers.get("Cc","") or (raw.get("cc") or ""))

        body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
        body_clean = URL_RE.sub(" URL ", body_clean)
        body_clean = re.sub(r"\s+", " ", body_clean).strip()
        body_text = body_clean

        from_name, from_email = parse_name_email(sender)
        from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
    else:
        subject_text = ftfy.fix_text(raw.get("subject") or raw.get("Subject") or "").strip()
        body_text = ftfy.fix_text(raw.get("body_text") or raw.get("text") or "")
        body_text = URL_RE.sub(" URL ", body_text)
        body_text = strip_quotes_and_sigs(body_text)
        body_text = re.sub(r"\s+", " ", body_text).strip()
        sender = raw.get("from") or raw.get("From") or ""
        from_name, from_email = parse_name_email(sender)
        from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
        to_emails = parse_multi_emails(raw.get("to") or "") + parse_multi_emails(raw.get("cc") or "")

    subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()

    if use_langdetect:
        try:
            lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
        except Exception:
            lang = fast_lang_heuristic(subject_norm + " " + (body_text or ""))
    else:
        lang = fast_lang_heuristic(subject_norm + " " + (body_text or ""))

    iso_date = ""
    if isinstance(date_val, (int, float)):
        try:
            iso_date = pd.to_datetime(int(date_val), unit="s", utc=True).isoformat()
        except Exception:
            iso_date = ""
    elif isinstance(date_val, str) and date_val:
        iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()

    msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
    if not msg_id:
        msg_id = f"gen-{uuid.uuid4().hex}"

    thread_key = subject_norm or (from_email + body_text[:120])
    thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
    text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""

    return {
        "message_id": str(msg_id),
        "thread_id": thread_id,
        "date": iso_date,
        "from_name": from_name,
        "from_email": from_email,
        "from_domain": from_domain,
        "to_emails": to_emails,
        "subject": subject_norm,
        "body_text": body_text,
        "lang": lang,
        "attachments": attach_names,
        "text_hash": text_hash,
    }

def has_suspect_tag(text: str) -> List[str]:
    tags = []
    if not text:
        return tags
    low = text.lower()
    for phrase in SUSPECT_PHRASES:
        if phrase in low:
            tags.append("🚩suspect")
            break
    if "invoice" in low or "payment" in low or "contract" in low:
        tags.append("finance")
    if "wire" in low or "transfer" in low or "cash" in low:
        if "finance" not in tags:
            tags.append("finance")
    if OFFCHANNEL_RE.search(low) or EVASIVE_ACRO_RE.search(low):
        tags.append("off-channel")
    return tags

def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
    if not VADER_OK:
        df["sentiment_score"] = np.nan
        df["sentiment"] = "(unknown)"
        return df
    analyzer = SentimentIntensityAnalyzer()
    scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
    bins = [-1.01, -0.05, 0.05, 1.01]
    labels = ["negative", "neutral", "positive"]
    df["sentiment_score"] = scores
    df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
    return df

def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
    terms = []
    txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
    for p in SUSPECT_PHRASES:
        if p in txt:
            terms.append(p)
    if MONEY_RE.search(txt):   terms.append("$")
    if INVOICE_RE.search(txt): terms.append("invoice")
    if PHONE_RE.search(row.get("body_text","") or ""): terms.append("phone")
    for t in extra_terms or []:
        t=t.strip()
        if t and t.lower() in txt:
            terms.append(t)
    out, seen = [], set()
    for t in terms:
        if t.lower() not in seen:
            out.append(t); seen.add(t.lower())
    return out[:24]

def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
                           cluster_label: Optional[str] = None,
                           do_highlight: bool = True,
                           extra_terms: Optional[List[str]] = None) -> str:
    subject = (row.get("subject") or "").strip()
    body    = (row.get("body_text") or "").strip()
    from_email = row.get("from_email") or ""
    date    = row.get("date") or ""
    tags    = row.get("tags") or []
    flags   = row.get("flags") or []
    sentiment = row.get("sentiment") or "(unknown)"

    hl_terms = []
    if do_highlight:
        hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
        seen=set(); uniq=[]
        for t in hl_terms:
            tl=t.lower()
            if tl and tl not in seen:
                uniq.append(t); seen.add(tl)
        hl_terms = uniq[:24]

    def hi(text: str) -> str:
        if not text or not do_highlight or not hl_terms:
            return text
        out = text
        for qt in hl_terms:
            if not qt:
                continue
            try:
                pat = re.compile(re.escape(qt), re.I)
                out = pat.sub(lambda m: f"<mark>{m.group(0)}</mark>", out)
            except Exception:
                pass
        return out

    subject_h = hi(subject)
    body_h    = hi(body)

    rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
    dir_attr = ' dir="rtl"' if rtl else ""
    body_html = body_h.replace("\n", "<br/>")

    def pill(s, cls="tag"):
        return f'<span class="{cls}">{s}</span>'

    tag_html = ""
    pills = []
    if isinstance(tags, list) and tags:
        pills += [pill(t, "tag") for t in tags]
    if isinstance(flags, list) and flags:
        pills += [pill(f, "tag") for f in flags]
    if pills:
        tag_html = " ".join(pills)

    cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""

    html = (
        f'<div class="email-card">'
        f'  <div class="email-header">'
        f'    <div>'
        f'      <div class="subject">{subject_h or "(no subject)"}</div>'
        f'      <div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>'
        f'    </div>'
        f'    <div class="badges">'
        f'      {cluster_html}'
        f'      <span class="sentiment">sentiment: <b>{sentiment}</b></span>'
        f'      {tag_html}'
        f'    </div>'
        f'  </div>'
        f'  <div class="email-body"{dir_attr}>'
        f'    {body_html}'
        f'  </div>'
        f'</div>'
    )
    return html

# ---------- Lightweight Embedding Utilities (Optional) ----------
def _load_embeddings(emb_path: str, binary: bool):
    if not GENSIM_OK or not emb_path or not os.path.exists(emb_path):
        return None, 0
    try:
        if binary:
            kv = KeyedVectors.load_word2vec_format(emb_path, binary=True)
        else:
            kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=False)
        return kv, int(kv.vector_size)
    except Exception:
        try:
            kv = KeyedVectors.load_word2vec_format(emb_path, binary=False, no_header=True)
            return kv, int(kv.vector_size)
        except Exception:
            return None, 0

def _avg_embed_for_text(text: str, kv, dim: int) -> np.ndarray:
    vec = np.zeros((dim,), dtype=np.float32)
    if not kv or not text:
        return vec
    toks = re.findall(TOKEN_PATTERN, text.lower())
    cnt = 0
    for t in toks:
        if t in kv:
            vec += kv[t]
            cnt += 1
    if cnt > 0:
        vec /= float(cnt)
        n = np.linalg.norm(vec)
        if n > 0:
            vec /= n
    return vec

def _build_doc_embeddings(texts: List[str], kv, dim: int) -> np.ndarray:
    if not kv or dim <= 0:
        return np.zeros((len(texts), 0), dtype=np.float32)
    out = np.zeros((len(texts), dim), dtype=np.float32)
    for i, t in enumerate(texts):
        out[i, :] = _avg_embed_for_text(t or "", kv, dim)
    return out

# =================== Feature engineering (BM25 + char) ===================
class BM25Transformer:
    def __init__(self, k1=1.2, b=0.75):
        self.k1 = k1
        self.b  = b
        self.idf_ = None
        self.avgdl_ = None

    def fit(self, X):
        N = X.shape[0]
        df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
        self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
        dl = np.asarray(X.sum(axis=1)).ravel()
        self.avgdl_ = float(dl.mean() if dl.size else 1.0)
        return self

    def transform(self, X):
        X = X.tocsr(copy=True).astype(np.float32)
        dl = np.asarray(X.sum(axis=1)).ravel()
        k1, b, avgdl = self.k1, self.b, self.avgdl_
        rows, cols = X.nonzero()
        data = X.data
        for i in range(len(data)):
            tf = data[i]
            d  = rows[i]
            denom = tf + k1 * (1 - b + b * (dl[d] / (avgdl + 1e-12)))
            data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
        return X

def enrich_text(row: pd.Series) -> str:
    subj = row.get("subject","") or ""
    body = row.get("body_text","") or ""
    t = subj + "\n\n" + body
    tokens = []
    if MONEY_RE.search(t):   tokens.append("__HAS_MONEY__")
    if PHONE_RE.search(t):   tokens.append("__HAS_PHONE__")
    if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
    if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
    if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
    lang_tok = f'__LANG_{(row.get("lang") or "unk").lower()}__'
    tokens.append(lang_tok)
    return (t + " " + " ".join(tokens)).strip()

# =================== Cluster labeling: PMI + class-TFIDF + SUBJECT BOOST (+ coverage ≥30% preference) ===================
def cluster_labels_pmi_bigram(
    texts,
    labels,
    subjects=None,
    topn=6,
    subject_alpha=0.75,
    global_ubiq_cut=0.20,
    subject_min_cov=0.30
):
    import math as _math
    from collections import Counter, defaultdict
    from sklearn.feature_extraction.text import TfidfVectorizer

    HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded",
                   "回复","主题","收件人","发件人"}

    def is_junk_token(tok: str) -> bool:
        if _is_junk_term(tok): return True
        tl = tok.lower()
        if tl.startswith("__"): return True
        if "@" in tl: return True
        if tl.isascii() and len(tl) <= 2: return True
        if LONG_ALNUM_RE.match(tok) or HEXISH_RE.match(tok) or DIGIT_HEAVY_RE.match(tok): return True
        if len(tok) > 40: return True
        if re.search(r"[^\w\-’']", tl): return True
        return False

    def tokenize_clean(t):
        toks = re.findall(TOKEN_PATTERN, (t or "").lower())
        return [w for w in toks if not is_junk_token(w)]

    def ngrams(toks, n):
        return [" ".join(p) for p in zip(*[toks[i:] for i in range(n)]) if all(not is_junk_token(x) for x in p)]

    glob_df_uni = Counter()
    glob_df_bg  = Counter()
    glob_df_tri = Counter()
    per_c_bg = defaultdict(Counter)
    per_c_tri = defaultdict(Counter)
    per_c_texts = defaultdict(list)
    per_c_doc_count = defaultdict(int)
    per_c_subj_uni_docs = defaultdict(Counter)
    per_c_subj_bg_docs  = defaultdict(Counter)
    per_c_subj_tri_docs = defaultdict(Counter)

    have_subjects = subjects is not None and len(subjects) == len(texts)

    for idx, (txt, c) in enumerate(zip(texts, labels)):
        c = int(c)
        toks = tokenize_clean(txt)
        uni_set = set(toks)
        bg_set  = set(ngrams(toks, 2))
        tri_set = set(ngrams(toks, 3))
        glob_df_uni.update(uni_set)
        glob_df_bg.update(bg_set)
        glob_df_tri.update(tri_set)
        per_c_bg[c].update(bg_set)
        per_c_tri[c].update(tri_set)
        per_c_texts[c].append(" ".join(toks))
        per_c_doc_count[c] += 1
        if have_subjects:
            stoks = tokenize_clean(subjects[idx] or "")
            s_uni = set(stoks)
            s_bg  = set(ngrams(stoks, 2))
            s_tri = set(ngrams(stoks, 3))
            per_c_subj_uni_docs[c].update(s_uni)
            per_c_subj_bg_docs[c].update(s_bg)
            per_c_subj_tri_docs[c].update(s_tri)

    N = max(1, len(texts))

    def too_ubiquitous(df_count):
        return (df_count / float(N)) > float(global_ubiq_cut)

    labels_out = {}
    for c in sorted(set(int(x) for x in labels)):
        n_docs_c = max(1, per_c_doc_count[c])
        phrases = []
        for store, glob_df, subj_docs, n in (
            (per_c_bg[c],  glob_df_bg,  per_c_subj_bg_docs[c], 2),
            (per_c_tri[c], glob_df_tri, per_c_subj_tri_docs[c], 3),
        ):
            total_c = sum(store.values()) + 1e-12
            total_g = sum(glob_df.values()) + 1e-12
            scored = []
            for ng, cnt in store.most_common(3000):
                if too_ubiquitous(glob_df[ng]):
                    continue
                p_ng_c = cnt / total_c
                p_ng_g = (glob_df[ng] / total_g)
                if p_ng_c > 0 and p_ng_g > 0:
                    score = _math.log(p_ng_c) - _math.log(p_ng_g)
                    cov = 0.0
                    if have_subjects:
                        cov = subj_docs[ng] / n_docs_c
                        if cov >= subject_min_cov:
                            score += 0.6
                    score += subject_alpha * cov
                    scored.append((score, cov, ng))
            scored.sort(key=lambda x: (x[1] >= subject_min_cov, x[0]), reverse=True)
            take = max(1, topn // (3 if n == 3 else 2))
            phrases.extend([p for _, _, p in scored[:take]])

        docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
        docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k != c), [])) or " "]
        corpus = [docs_c[0], docs_bg[0]]
        vec = TfidfVectorizer(
            analyzer="word", ngram_range=(1,1),
            max_features=3000, token_pattern=TOKEN_PATTERN, lowercase=True
        )
        X = vec.fit_transform(corpus)
        vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
        row = X[0].toarray().ravel()

        subj_cov = np.zeros_like(row)
        subj_cov_frac = np.zeros_like(row)
        vocab_index = {t:i for i,t in enumerate(vocab)}
        if have_subjects:
            for tok, cnt_docs in per_c_subj_uni_docs[c].items():
                if tok in vocab_index and not _is_junk_term(tok):
                    i = vocab_index[tok]
                    frac = cnt_docs / n_docs_c
                    subj_cov[i] = frac
                    subj_cov_frac[i] = frac

        row_boosted = row + subject_alpha * subj_cov
        pref_bump = (subj_cov_frac >= subject_min_cov).astype(row_boosted.dtype) * 0.6
        final = row_boosted + pref_bump

        order = final.argsort()[::-1]
        unis = []
        for i in order:
            tok = vocab[i]
            if _is_junk_term(tok): continue
            if too_ubiquitous(glob_df_uni.get(tok, 0)):
                continue
            unis.append(tok)
            if len(unis) >= max(0, topn - len(phrases)):
                break

        parts = (phrases + unis)[:max(2, topn)]
        labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"

    return labels_out

# =================== Auto-k & merge ===================
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
    n = X.shape[0]
    if n <= 1:
        return 1, {1: 0.0}
    if n < min(ks):
        k_small = max(2, min(10, n))
        return int(k_small), {int(k_small): 0.0}

    if n > 40000:
        rs = np.random.RandomState(0)
        idx = rs.choice(n, size=40000, replace=False)
        Xs = X[idx]
    else:
        Xs = X
    inertias = []
    for k in ks:
        k = int(k)
        if n < k:
            break
        km = MiniBatchKMeans(n_clusters=k, batch_size=4096, random_state=0, n_init="auto")
        km.fit(Xs)
        inertias.append(km.inertia_)
    if not inertias:
        k_small = max(2, min(10, n))
        return int(k_small), {int(k_small): 0.0}
    x = np.array(list(ks)[:len(inertias)], dtype=float)
    y = np.array(inertias, dtype=float)
    y_norm = (y - y.min()) / (y.max() - y.min() + 1e-9)
    x_norm = (x - x.min()) / (x.max() - x.min() + 1e-9)
    chord = y_norm[0] + (y_norm[-1] - y_norm[0]) * (x_norm - x_norm[0])/(x_norm[-1]-x_norm[0]+1e-9)
    dist = chord - y_norm
    k_best = int(x[np.argmax(dist)])
    return k_best, dict(zip(x.astype(int), inertias))

def auto_k_rule(n_docs: int) -> int:
    return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))

def merge_close_clusters(labels, centers, thresh=0.92):
    centers = sk_normalize(centers)
    sim = cosine_similarity(centers, centers)
    k = centers.shape[0]
    parent = list(range(k))
    def find(a):
        while parent[a]!=a: a=parent[a]
        return a
    for i in range(k):
        for j in range(i+1, k):
            if sim[i,j] >= thresh:
                pi, pj = find(i), find(j)
                if pi!=pj: parent[pj]=pi
    root = {i:find(i) for i in range(k)}
    idmap, new_id = {}, 0
    for i in range(k):
        r = root[i]
        if r not in idmap:
            idmap[r] = new_id
            new_id += 1
    labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
    return labels2

def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
                            lsa_components: np.ndarray, norm_obj: Normalizer,
                            d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
    seeds_word = []
    vocab = count_vec.vocabulary_
    for _, words in lexicons.items():
        idxs = [vocab.get(w.lower()) for w in words if vocab.get(w.lower()) is not None]
        if not idxs:
            continue
        v = np.zeros((d_word,), dtype=np.float32)
        v[idxs] = 1.0
        n = np.linalg.norm(v)
        if n > 0:
            v /= n
            seeds_word.append(v)
    if not seeds_word:
        return None
    seeds_full = []
    for v in seeds_word:
        vf = np.zeros((d_full,), dtype=np.float32)
        vf[:d_word] = v
        seeds_full.append(vf)
    seeds_full = np.stack(seeds_full, axis=0)
    seeds_red = seeds_full @ lsa_components.T
    seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
    if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
        return seeds_red
    return None

# =================== NEW: cluster stabilizer ===================
def _centroids_from_labels(X, labels):
    labs = np.asarray(labels, dtype=int)
    uniq = np.unique(labs)
    cents = {}
    if isinstance(X, np.ndarray):
        for c in uniq:
            idx = (labs == c)
            if not np.any(idx): continue
            v = X[idx].mean(axis=0)
            n = np.linalg.norm(v)
            if n > 0: v = v / n
            cents[int(c)] = v.astype(np.float32)
        return cents
    X = X.tocsr()
    for c in uniq:
        rows = np.where(labs == c)[0]
        if rows.size == 0: continue
        sub = X[rows]
        v = np.asarray(sub.mean(axis=0)).ravel()
        n = np.linalg.norm(v)
        if n > 0: v = v / n
        cents[int(c)] = v.astype(np.float32)
    return cents

def _cosine_sim_to_centroids(vecs, centroids):
    if not centroids:
        return None, None
    keys = list(centroids.keys())
    C = np.stack([centroids[k] for k in keys], axis=0)
    if isinstance(vecs, np.ndarray):
        sims = vecs @ C.T
    else:
        sims = vecs.dot(C.T)
        if hasattr(sims, "toarray"): sims = sims.toarray()
    best_idx = np.argmax(sims, axis=1)
    best_lab = np.array([keys[i] for i in best_idx], dtype=int)
    best_sim = sims[np.arange(sims.shape[0]), best_idx]
    return best_lab, best_sim

def stabilize_labels(X_space, labels, min_size=40, merge_thresh=0.96, reassign_thresh=0.35):
    labs = np.asarray(labels, dtype=int)
    cents = _centroids_from_labels(X_space, labs)
    keys = sorted([k for k in cents.keys() if k >= 0])
    if len(keys) >= 2:
        C = np.stack([cents[k] for k in keys], axis=0)
        sims = C @ C.T
        parent = {k:k for k in keys}
        def find(a):
            while parent[a]!=a:
                a = parent[a]
            return a
        for i in range(len(keys)):
            for j in range(i+1, len(keys)):
                if sims[i,j] >= float(merge_thresh):
                    ri, rj = find(keys[i]), find(keys[j])
                    if ri != rj:
                        parent[rj] = ri
        root = {k: find(k) for k in keys}
        merge_map = {k: root[k] for k in keys}
        labs = np.array([merge_map.get(int(c), int(c)) for c in labs], dtype=int)
        cents = _centroids_from_labels(X_space, labs)

    vc = pd.Series(labs).value_counts()
    big_labs = set(vc[vc >= int(min_size)].index.tolist())
    small_labs = set(vc[vc < int(min_size)].index.tolist())
    big_cents = {c: cents[c] for c in big_labs if c in cents and c >= 0}

    NOISE_ID = -3
    if small_labs and big_cents:
        idx_small = np.where(pd.Series(labs).isin(small_labs))[0]
        if idx_small.size > 0:
            sub = X_space[idx_small] if not isinstance(X_space, np.ndarray) else X_space[idx_small]
            best_lab, best_sim = _cosine_sim_to_centroids(sub, big_cents)
            reassigned = np.where(best_sim >= float(reassign_thresh), best_lab, NOISE_ID)
            labs[idx_small] = reassigned

    return labs

# =================== Scoring & Flags ===================
def _hour_of(dt_iso: str) -> Optional[int]:
    try:
        if not dt_iso: return None
        dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
        if pd.isna(dt): return None
        return int(dt.hour)
    except Exception:
        return None

def _attachment_flags(names: List[str]) -> List[str]:
    flags=[]
    for n in names or []:
        if ATTACH_NAME_RE.search(n):
            flags.append("📎 " + n[:40])
    return flags[:5]

def corruption_score(row, trusted_domains: set):
    score = 0.0
    txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
    for ph in SUSPECT_PHRASES:
        if ph in txt:
            score += 2.0
            break
    if EVASIVE_ACRO_RE.search(txt) or OFFCHANNEL_RE.search(txt):
        score += 1.0
    if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
        score += 1.5
    if MONEY_RE.search(txt):   score += 0.7
    if INVOICE_RE.search(txt): score += 0.7
    if str(row.get("sentiment","")) == "negative":
        score += 0.3
    body_len = len(row.get("body_text",""))
    if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
        score += 0.5
    fd = (row.get("from_domain") or "").lower()
    if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
        score += 0.5
    h = _hour_of(row.get("date") or "")
    if h is not None and (h < 6 or h > 22):
        score += 0.3
    return score

def compute_context_anomaly(df_in: pd.DataFrame) -> pd.DataFrame:
    if df_in.empty:
        df_in["context_anomaly_score"] = 0.0
        return df_in

    df = df_in.copy()
    if "anomaly_score" in df.columns:
        df["_if_pct"] = 0.0
        for bkt, grp in df.groupby("bucket", dropna=False):
            vals = grp["anomaly_score"].astype(float)
            if vals.notna().sum() >= 5:
                ranks = vals.rank(pct=True, ascending=False)
                df.loc[grp.index, "_if_pct"] = ranks.clip(0, 1)
        df["_if_pts"] = (df["_if_pct"] * 6.0).clip(0, 6)
    else:
        df["_if_pts"] = 0.0

    df["_rule_pts"] = 0.0
    low = (df["subject"].fillna("") + " " + df["body_text"].fillna("")).str.lower()
    for bkt, terms in TAXONOMY.items():
        mask = (df["bucket"] == bkt)
        if not mask.any():
            continue
        if terms:
            has_term = low.str.contains("|".join([re.escape(t.lower()) for t in terms]), regex=True)
            df.loc[mask & (~has_term), "_rule_pts"] += 1.0
        if bkt == "Constituent":
            df.loc[mask & (~df["from_domain"].str.lower().isin(PERSONAL_DOMAINS)), "_rule_pts"] += 1.0
        if bkt == "Scheduling":
            subj = df.loc[mask, "subject"].fillna("").str.lower()
            df.loc[mask & (~subj.str.contains(r"\bmeeting|invite|schedule|calendar\b", regex=True)), "_rule_pts"] += 1.0

    df["_rule_pts"] = df["_rule_pts"].clip(0, 2)
    df["_corr_pts"] = df["corruption_score"].fillna(0).clip(0, 3)

    df["context_anomaly_score"] = (df["_if_pts"] + df["_rule_pts"] + df["_corr_pts"]).clip(0, 10)
    return df.drop(columns=["_if_pct","_if_pts","_rule_pts","_corr_pts"], errors="ignore")

# =================== 🔧 NEW: Per-bucket k & stabilizer params ===================
def _bucket_k_multiplier(bucket_name: str) -> float:
    b = (bucket_name or "").lower()
    if b in ("constituent",):               return 1.25
    if b in ("procurement", "campaign finance", "receipts/billing", "lobbyist"):  return 1.15
    if b in ("scheduling", "other"):        return 1.00
    if b in ("legal",):                     return 0.80
    return 1.00

def _bucket_stabilizer_params(bucket_name: str) -> Tuple[int, float, float]:
    b = (bucket_name or "").lower()
    if b == "legal":             return (30, 0.97, 0.38)
    if b == "procurement":       return (35, 0.96, 0.36)
    if b == "campaign finance":  return (35, 0.96, 0.36)
    if b == "constituent":       return (40, 0.95, 0.33)
    if b == "receipts/billing":  return (40, 0.95, 0.35)
    if b == "scheduling":        return (35, 0.95, 0.35)
    return (40, 0.96, 0.35)

# =================== 🔧 NEW: Label de-dup helpers ===================
def _normalize_label_tokens(label: str) -> set:
    if not label: return set()
    txt = str(label).lower()
    toks = re.findall(r"[a-z\u0590-\u05FF][a-z\u0590-\u05FF\-']{1,}", txt)
    toks2 = [t[:-1] if len(t) > 3 and t.endswith("s") else t for t in toks]
    return {t for t in toks2 if t not in STOP_TERMS and t not in EN_STOP and t not in HE_STOP and len(t) >= 2}

def _jaccard(a: set, b: set) -> float:
    if not a or not b: return 0.0
    inter = len(a & b)
    if inter == 0: return 0.0
    return inter / float(len(a | b))

def dedupe_cluster_labels_in_bucket(df: pd.DataFrame, bucket: str, sim_thresh: float = 0.72) -> pd.DataFrame:
    sel = df[df["bucket"] == bucket].copy()
    if sel.empty or "cluster_name" not in sel.columns:
        return df

    names = sel[["cluster_id", "cluster_name"]].drop_duplicates()
    tokens = {int(cid): _normalize_label_tokens(str(name)) for cid, name in names.values}
    ids = list(tokens.keys())

    parent = {i: i for i in ids}
    def find(i): 
        while parent[i] != i: i = parent[i]
        return i
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb: parent[rb] = ra

    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            if _jaccard(tokens[ids[i]], tokens[ids[j]]) >= sim_thresh:
                union(ids[i], ids[j])

    names_map = dict(names.values)
    comp_to_canon = {}
    for cid in ids:
        root = find(cid)
        comp_to_canon.setdefault(root, [])
        comp_to_canon[root].append((cid, names_map.get(cid, "")))

    canon_for_cluster = {}
    for root, items in comp_to_canon.items():
        best = max(items, key=lambda kv: (len(kv[1] or ""), kv[1]))
        for cid, _ in items:
            canon_for_cluster[cid] = best[1]

    df.loc[sel.index, "cluster_name"] = sel["cluster_id"].map(lambda c: canon_for_cluster.get(int(c), names_map.get(int(c), "")))
    return df

def dedupe_all_labels(df: pd.DataFrame) -> pd.DataFrame:
    out = df
    for bkt in sorted(df["bucket"].dropna().unique()):
        out = dedupe_cluster_labels_in_bucket(out, bkt, sim_thresh=0.72)
    return out

# =================== 🔎 NEW: Surveillance-campaign detection ===================
SURV_KEYWORDS = [
    "daily report","daily brief","briefing","sitreps","sitrep","situation report","summary",
    "dossier","monitoring","tracking","watchlist","watch list","profile","surveillance",
    "intel","intelligence","osint","open source intel","clippings","press clips","digest",
    "alert","alerting","dispatch","bulletin","roundup","update"
]
SURV_RE = re.compile("|".join([re.escape(k) for k in SURV_KEYWORDS]), re.I)

SUBJ_NUM_RE   = re.compile(r"\b\d{1,4}([,./-]\d{1,4})*\b")
SUBJ_DATE_RE  = re.compile(r"\b(?:\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?|\d{4}-\d{2}-\d{2}|jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b", re.I)
SUBJ_FW_RE    = re.compile(r"^\s*(re:|fw:|fwd:)\s*", re.I)
EMAIL_RE      = re.compile(r"\b[\w.\-+%]+@[\w.-]+\.[A-Za-z]{2,}\b")

def _candidate_entities_from_subjects(df: pd.DataFrame, extra_watchlist: List[str]) -> List[str]:
    cand = set([w.strip() for w in (extra_watchlist or []) if w.strip()])
    subs = df["subject"].dropna().astype(str).tolist()
    pat = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\b")
    for s in subs:
        for m in pat.finditer(s):
            name = m.group(1).strip()
            if name.lower() in EN_STOP or len(name) < 5: 
                continue
            cand.add(name)
    out = sorted(cand)
    return out[:3000]

def _normalize_subject_template(subj: str, entity: str) -> str:
    if not subj: return ""
    s = SUBJ_FW_RE.sub("", subj)
    try:
        s = re.sub(re.escape(entity), "«ENTITY»", s, flags=re.I)
    except Exception:
        pass
    s = SUBJ_DATE_RE.sub("«DATE»", s)
    s = SUBJ_NUM_RE.sub("«NUM»", s)
    s = EMAIL_RE.sub("«EMAIL»", s)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def _entity_mask_present(row: pd.Series, entity: str) -> bool:
    t = (row.get("subject","") + " " + row.get("body_text","")).lower()
    e = (entity or "").lower()
    return (e in t) if e else False

def detect_surveillance_campaigns(
    df: pd.DataFrame,
    watchlist: Optional[List[str]] = None,
    min_mentions: int = 15,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    if df.empty: 
        return pd.DataFrame(), pd.DataFrame()

    watch = [w.strip() for w in (watchlist or []) if w.strip()]
    cands = _candidate_entities_from_subjects(df, watch)

    dfd = df.copy()
    dfd["_dt"] = pd.to_datetime(dfd["date"], utc=True, errors="coerce")
    dfd["_day"] = dfd["_dt"].dt.date
    dfd["_week"] = dfd["_dt"].dt.to_period("W").astype(str)
    dfd["_is_news"] = (dfd["bucket"] == "Newsletters/Alerts")
    dfd["_is_it"]   = (dfd["bucket"] == "IT/Security")
    dfd["_is_internal"] = ~(dfd["_is_news"] | dfd["_is_it"])
    dfd["_recips"] = dfd["to_emails"].apply(lambda xs: len(xs) if isinstance(xs, list) else 0)

    rows = []
    samples = []

    for entity in cands:
        mask = dfd.apply(lambda r: _entity_mask_present(r, entity), axis=1)
        grp = dfd[mask]
        n = len(grp)
        if n < int(min_mentions):
            continue

        n_senders  = grp["from_email"].nunique()
        n_domains  = grp["from_domain"].nunique()
        pct_news   = float((grp["_is_news"].mean() if n else 0.0))
        pct_int    = float((grp["_is_internal"].mean() if n else 0.0))
        avg_recips = float((grp["_recips"].mean() if n else 0.0))

        wk = grp.groupby("_week").size().astype(float)
        if len(wk) >= 4:
            baseline = wk.iloc[:-1]
            mu = float(baseline.mean()) if len(baseline) else 0.0
            sd = float(baseline.std(ddof=1)) if len(baseline) > 1 else 0.0
            last = float(wk.iloc[-1])
            weekly_peak_z = 0.0 if sd == 0.0 else (last - mu) / sd
        else:
            weekly_peak_z = 0.0

        norm_subj = grp["subject"].fillna("").astype(str).map(lambda s: _normalize_subject_template(s, entity))
        if len(norm_subj):
            top_template_share = norm_subj.value_counts(normalize=True).iloc[0]
        else:
            top_template_share = 0.0

        kw_share = float(((grp["subject"].fillna("") + " " + grp["body_text"].fillna("")).str.contains(SURV_RE).mean()) if n else 0.0)

        score = 0.0
        score += 2.5 * min(1.0, top_template_share)
        score += 2.0 * min(1.0, kw_share)
        score += 1.5 * min(1.0, weekly_peak_z / 3.0)
        score += 0.8 * min(1.0, n_senders / 10.0)
        score += 0.5 * min(1.0, n_domains / 10.0)
        score += 1.0 * pct_int
        score += 0.3 * min(1.0, avg_recips / 10.0)

        level = "info"
        if score >= 6.5: level = "likely"
        elif score >= 4.5: level = "possible"

        first_d = grp["_dt"].min()
        last_d  = grp["_dt"].max()

        rows.append({
            "entity": entity,
            "surveillance_score": round(float(score), 3),
            "level": level,
            "n_emails": int(n),
            "n_senders": int(n_senders),
            "n_domains": int(n_domains),
            "pct_newsletters": round(pct_news, 3),
            "pct_internal": round(pct_int, 3),
            "avg_recipients": round(avg_recips, 2),
            "weekly_peak_z": round(float(weekly_peak_z), 3),
            "template_max_share": round(float(top_template_share), 3),
            "keyword_share": round(float(kw_share), 3),
            "first_date": str(first_d) if pd.notna(first_d) else "",
            "last_date":  str(last_d) if pd.notna(last_d)  else "",
            "notes": "template/keywords/cadence/senders/domains mix"
        })

        ex = grp[["date","from_email","from_domain","subject","bucket"]].copy().head(8)
        ex.insert(0, "entity", entity)
        samples.append(ex)

    ent_df = pd.DataFrame(rows).sort_values(["surveillance_score","n_emails"], ascending=[False, False]).head(200)
    samp_df = pd.concat(samples, ignore_index=True) if samples else pd.DataFrame()

    return ent_df, samp_df

def tag_surveillance_emails(df: pd.DataFrame, ent_df: pd.DataFrame, threshold: float = 4.5) -> pd.DataFrame:
    if df.empty or ent_df.empty: 
        return df
    hot = ent_df[ent_df["surveillance_score"] >= float(threshold)]["entity"].tolist()
    if not hot: return df
    def _tag(row):
        txt = (row.get("subject","") + " " + row.get("body_text","")).lower()
        tags = set(row.get("tags") or [])
        for e in hot:
            if e.lower() in txt:
                tags.add("surveillance")
                break
        return sorted(tags)
    out = df.copy()
    out["tags"] = out.apply(_tag, axis=1)
    return out

# =================== UI / PIPELINE CONTINUATION ===================

# ---------- Styles ----------
CSS = """
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
.email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
.email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
.subject { color:#0f172a; font-size:18px; font-weight:700; margin-bottom:6px; }
.meta { color:#334155; font-size:12px; }
.badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
.cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
.sentiment { font-size:12px; color:#334155; }
.tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
.email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.6; white-space:normal; color:#111827; }
.email-body a { color:#1d4ed8; text-decoration:underline; }
mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
.small { color:#475569; font-size:12px; }
.cursor { cursor:pointer; }
"""

# ---------- App ----------
with gr.Blocks(title="Email Investigator — Per-bucket-k + Label Dedup + Surveillance Radar", css=CSS, theme="soft") as demo:
    gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
    gr.Markdown(
        "This build includes per-bucket **k** heuristics, label **de-dup**, and a **surveillance-campaign detector** "
        "(template cadence + keywords + multi-sender/domain signals)."
    )

    with gr.Row():
        inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])

    with gr.Accordion("Vectorization & Clustering", open=True):
        with gr.Row():
            max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
            min_df = gr.Number(label="min_df (doc freq ≥)", value=3, precision=0)
            max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
            use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
            skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
        with gr.Row():
            use_hashing = gr.Checkbox(label="Use HashingVectorizer (memory-light, fast)", value=True)
            hash_bits   = gr.Slider(label="Hashing bits (2^n features)", minimum=16, maximum=20, step=1, value=18)
        with gr.Row():
            use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
            lsa_dim = gr.Number(label="LSA components", value=256, precision=0)
            auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
            k_clusters = gr.Number(label="Base k (before per-bucket multiplier)", value=350, precision=0)
            mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
        with gr.Row():
            use_hdbscan = gr.Checkbox(label="Use HDBSCAN (auto-k, noise) on reduced vectors", value=False)
            hdb_min_cluster = gr.Number(label="HDBSCAN min_cluster_size", value=60, precision=0)
            hdb_min_samples = gr.Number(label="HDBSCAN min_samples (0=auto)", value=0, precision=0)
        with gr.Row():
            per_language = gr.Checkbox(label="Cluster per language (reduces cross-language mixing)", value=True)
        with gr.Row():
            use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
            use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)

    with gr.Accordion("Investigation Controls", open=True):
        with gr.Row():
            trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
            extra_keywords_in  = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
            highlight_toggle   = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
        with gr.Row():
            use_embeddings = gr.Checkbox(label="Add lightweight word embeddings (avg word2vec/GloVe) if available", value=False)
            embed_weight   = gr.Slider(label="Embedding weight in feature space", minimum=0.0, maximum=1.0, step=0.05, value=0.35)
        with gr.Row():
            embeddings_path = gr.Textbox(label="Path to local embeddings (.txt/.vec/.bin) (optional)", value="")
            embeddings_binary = gr.Checkbox(label="File is binary word2vec format", value=False)
        with gr.Row():
            bucket_drop = gr.Dropdown(label="Bucket", choices=["(any)"] + list(TAXONOMY.keys()), value="(any)", allow_custom_value=False)
            cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
            domain_drop  = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
            sender_drop  = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
            lang_drop    = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
            sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
            tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "surveillance", "odd-hours", "personal-mail"], value="(any)")
        with gr.Row():
            date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
            date_end   = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
            sort_by    = gr.Dropdown(label="Sort by", choices=["context_anomaly_score","corruption_score","date","anomaly_score","search_score"], value="context_anomaly_score")
            sort_dir   = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
        with gr.Row():
            hide_noise = gr.Checkbox(label="Hide noise/unassigned (cluster -3)", value=True)

    with gr.Accordion("Surveillance Radar", open=True):
        with gr.Row():
            watchlist_in = gr.Textbox(label="Watchlist (names or entities, comma-separated)", value="Hillary Clinton, Joe Biden, Donald Trump")
            min_mentions = gr.Number(label="Min mentions per entity", value=15, precision=0)

    with gr.Row():
        run_btn = gr.Button("Process", variant="primary")
        # NEW: Update button lets you re-run with same uploaded file & current settings
        update_btn = gr.Button("Update", variant="secondary")  # NEW: Update
        reset_btn = gr.Button("Reset filters")
    status = gr.Markdown("")

    with gr.Row():
        cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
        domain_counts_df  = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
    with gr.Row():
        sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True)

    with gr.Row():
        actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
        offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)

    gr.Markdown("### Surveillance Campaigns (detected entities)")
    with gr.Row():
        surv_entities_df = gr.Dataframe(label="Entities ranked by surveillance score", interactive=False, wrap=True)
        surv_samples_df = gr.Dataframe(label="Sample emails for highlighted entities", interactive=False, wrap=True)

    gr.Markdown("### Search")
    with gr.Row():
        search_query = gr.Textbox(label="Search (keywords, names, etc.)")
        search_btn = gr.Button("Search")
    results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
    email_view = gr.HTML(label="Reader")

    # -------- State --------
    state_df          = gr.State()
    state_vec         = gr.State()
    state_X_reduced   = gr.State()
    state_index       = gr.State()
    state_term_names  = gr.State()
    state_query_terms = gr.State()
    state_use_lsa     = gr.State()
    state_use_faiss   = gr.State()
    state_svd         = gr.State()
    state_norm        = gr.State()
    state_dims        = gr.State()
    state_extra_terms = gr.State()
    state_highlight   = gr.State()
    state_inbox       = gr.State()   # NEW: keep last uploaded file for Update

        # -------- IO helpers --------
    def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
        recs: List[Dict[str, Any]] = []
        if local_path.endswith(".jsonl"):
            with open(local_path, "r", encoding="utf-8") as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except Exception:
                        continue
                    if str(obj.get("type", "")).lower() == "meta":
                        continue
                    recs.append(obj)
        else:
            with open(local_path, "r", encoding="utf-8") as fh:
                obj = json.load(fh)
                if isinstance(obj, list):
                    for r in obj:
                        if str(r.get("type", "")).lower() == "meta":
                            continue
                        recs.append(r)
                elif isinstance(obj, dict):
                    if str(obj.get("type", "")).lower() != "meta":
                        recs = [obj]
        return recs

    def _apply_filters(
        df: pd.DataFrame,
        bucket: Optional[str],
        cluster: Optional[str],
        domain: Optional[str],
        sender: Optional[str],
        lang_value: str,
        sentiment: str,
        tag_value: str,
        start: str,
        end: str,
        hide_noise_flag: bool = False,
    ) -> pd.DataFrame:
        out = df
        if bucket and bucket != "(any)":
            out = out[out["bucket"] == bucket]
        if cluster and cluster != "(any)":
            m = re.match(r"^.*?(\-?\d+)\s+—", cluster)
            if m:
                cid = int(m.group(1))
                out = out[out["cluster_id"] == cid]
        if domain and domain != "(any)":
            out = out[out["from_domain"] == domain]
        if sender and sender != "(any)":
            out = out[out["from_email"] == sender]
        if lang_value and lang_value != "(any)":
            out = out[out["lang"] == lang_value]
        if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
            out = out[out["sentiment"].astype(str) == sentiment]
        if tag_value and tag_value != "(any)":
            out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))
                      | out["flags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
        if start:
            try:
                dt = pd.to_datetime(start, utc=True, errors="coerce")
                out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") >= dt]
            except Exception:
                pass
        if end:
            try:
                dt = pd.to_datetime(end, utc=True, errors="coerce")
                out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
            except Exception:
                pass
        if hide_noise_flag:
            out = out[out["cluster_id"] != -3]
        return out

    # -------- Social graph summary --------
    def social_stats(df: pd.DataFrame) -> pd.DataFrame:
        deg = {}
        def add_edge(a,b):
            if not a or not b or a==b: return
            deg.setdefault(a,set()).add(b)
            deg.setdefault(b,set()).add(a)
        for _, r in df.iterrows():
            f = r.get("from_email") or ""
            tos = r.get("to_emails") or []
            for t in tos:
                add_edge(f, t)
        rows=[]
        for addr, nbrs in deg.items():
            rows.append({"address": addr, "degree": len(nbrs)})
        out = pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
        return out

    # -------- Sorting helper --------
    def _sort_results(df, by, direction):
        if df is None or len(df) == 0:
            return pd.DataFrame()
        tmp = df.copy()
        if "date" in tmp.columns:
            tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
        else:
            tmp["_dt"] = pd.NaT
        by = by if by in tmp.columns else "context_anomaly_score"
        asc = (direction == "asc")
        sort_cols = [by]
        if by == "date":
            sort_cols = ["_dt"]
        elif by in ["anomaly_score", "corruption_score", "context_anomaly_score"]:
            sort_cols.append("_dt")
        tmp = tmp.sort_values(sort_cols, ascending=[asc, False])
        cols_out = [
            "date","bucket","from_email","from_domain","subject","cluster_name","lang",
            "tags","flags","sentiment","context_anomaly_score","corruption_score","anomaly_score"
        ]
        if "search_score" in tmp.columns:
            cols_out.append("search_score")
        return tmp[[c for c in cols_out if c in tmp.columns]].head(500)

    # -------- Vectorization helpers (mirror training path for queries) --------
    def _tokenize_query(q: str) -> List[str]:
        return [p.strip() for p in re.split(r"\s+", q or "") if p.strip()][:8]

    def _project_query_to_lsa(q_vec, svd, norm) -> Optional[np.ndarray]:
        try:
            return norm.transform(svd.transform(q_vec)).astype(np.float32)
        except Exception:
            return None

    def _vectorize_query(q, vec_state, corpus_texts):
        # Build the same features for the query that we used for docs
        char_min_df = 1 if len(corpus_texts) <= 1 else 2

        if vec_state.get("use_hashing"):
            hv = HashingVectorizer(
                analyzer="word",
                ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
                n_features=2 ** vec_state.get("hash_bits", 18),
                token_pattern=TOKEN_PATTERN,
                lowercase=True,
                norm=None,
                alternate_sign=False,
            )
            counts = hv.transform(corpus_texts)
            tfidf_tr = TfidfTransformer().fit(counts)
            q_word = tfidf_tr.transform(hv.transform([q]))
        else:
            cv = CountVectorizer(
                analyzer="word",
                ngram_range=(1, 2) if vec_state.get("use_bigrams") else (1, 1),
                max_features=vec_state.get("max_features"),
                min_df=vec_state.get("min_df"),
                max_df=vec_state.get("max_df"),
                token_pattern=TOKEN_PATTERN,
                lowercase=True,
                stop_words=STOPWORD_FOR_VEC,
                dtype=np.float32,
            )
            tf = cv.fit_transform(corpus_texts)
            bm25 = BM25Transformer().fit(tf)
            q_word = bm25.transform(cv.transform([q]))

        char_vec = CharTfidf(
            analyzer="char", ngram_range=(3, 5), min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32
        ).fit(corpus_texts)
        q_char = char_vec.transform([q])

        return hstack([q_word, q_char * 0.20], format="csr")

    # -------- Main pipeline --------
    def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
                     use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
                     trusted_domains_in, extra_keywords_in, highlight_toggle,
                     use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
                     per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
                     watchlist_in, min_mentions):
        if inbox_file is None:
            return (
                "**Please upload a file.**",
                None, None, None, None, None,
                None, None,  # surveillance outputs
                None,  # results_df
                None, None, None,  # states df/vec/X
                None, None,        # index & term names
                None, None,        # flags
                gr.update(), gr.update(), gr.update(), gr.update(),  # dropdowns
                None, None, None,  # svd/norm/dims
                None, None,        # extra terms / highlight
                gr.update()        # bucket list
            )

        # === Inner helpers for this function ===
        def _make_texts(df_in: pd.DataFrame) -> Tuple[List[str], List[str]]:
            texts = list(df_in.apply(enrich_text, axis=1))
            subjects_only = list(df_in["subject"].fillna(""))
            return texts, subjects_only

        def _vectorize_block(
            texts: List[str],
            use_bigrams: bool,
            max_features: int,
            min_df: int,
            max_df: float,
            use_hashing: bool,
            hash_bits: int
        ):
            """
            Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
            Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
            """
            n_docs = len(texts)
            ngram_range = (1, 2) if use_bigrams else (1, 1)
            char_min_df = 1 if n_docs <= 1 else 2

            if use_hashing:
                hv = HashingVectorizer(
                    analyzer="word", ngram_range=ngram_range,
                    n_features=2 ** int(hash_bits), alternate_sign=False,
                    token_pattern=TOKEN_PATTERN, lowercase=True, norm=None
                )
                word_counts = hv.transform(texts)
                tfidf_tr = TfidfTransformer()
                X_word = tfidf_tr.fit_transform(word_counts).astype(np.float32)

                char_vec = CharTfidf(
                    analyzer="char", ngram_range=(3, 5), min_df=char_min_df,
                    max_features=100_000, lowercase=True, dtype=np.float32
                )
                X_char = char_vec.fit_transform(texts)
                X_full = hstack([X_word, X_char * 0.20], format="csr")
                d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
                count_vec = None; bm25 = None
                return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full

            count_vec = CountVectorizer(
                analyzer="word", ngram_range=ngram_range,
                max_features=int(max_features) if max_features else None,
                min_df=int(min_df) if min_df else 2, max_df=float(max_df) if max_df else 0.7,
                token_pattern=TOKEN_PATTERN, lowercase=True,
                dtype=np.float32, stop_words=STOPWORD_FOR_VEC
            )
            TF = count_vec.fit_transform(texts)
            bm25 = BM25Transformer(k1=1.2, b=0.75).fit(TF)
            X_word = bm25.transform(TF)

            char_vec = CharTfidf(
                analyzer="char", ngram_range=(3, 5), min_df=char_min_df,
                max_features=100_000, lowercase=True, dtype=np.float32
            )
            X_char = char_vec.fit_transform(texts)
            X_full = hstack([X_word, X_char * 0.20], format="csr")
            d_word, d_char, d_full = X_word.shape[1], X_char.shape[1], X_word.shape[1] + X_char.shape[1]
            return X_full, count_vec, char_vec, bm25, d_word, d_char, d_full

        def _reduce_space(X_full, use_lsa, lsa_dim):
            svd_obj = None
            norm_obj = None
            X_reduced = None
            if not use_lsa:
                return X_reduced, svd_obj, norm_obj

            n_docs = X_full.shape[0]
            n_feats = X_full.shape[1]
            max_components = max(1, min(n_docs, n_feats) - 1)
            n_comp = int(min(int(lsa_dim or 256), max_components))

            if n_comp < 2:
                return None, None, None

            svd_obj = TruncatedSVD(n_components=n_comp, random_state=0)
            Xtmp = svd_obj.fit_transform(X_full)  # dense
            norm_obj = Normalizer(copy=False)
            X_reduced = norm_obj.fit_transform(Xtmp).astype(np.float32)
            del Xtmp; gc.collect()
            return X_reduced, svd_obj, norm_obj

        def _attach_embeddings(texts, X_reduced_or_full, use_lsa, kv, emb_dim, weight):
            if kv is None or emb_dim <= 0 or weight <= 0.0:
                return X_reduced_or_full, emb_dim
            doc_embs = _build_doc_embeddings(texts, kv, emb_dim).astype(np.float32)
            if weight != 1.0:
                doc_embs *= float(weight)
            if isinstance(X_reduced_or_full, np.ndarray):
                return np.hstack([X_reduced_or_full, doc_embs]).astype(np.float32), emb_dim
            else:
                X_emb = csr_matrix(doc_embs)
                return hstack([X_reduced_or_full, X_emb], format="csr"), emb_dim

        def _cluster_space(
            X_space,
            bucket_name: str,
            df_part: pd.DataFrame,
            use_lsa: bool,
            use_hdbscan: bool,
            hdb_min_cluster: int,
            hdb_min_samples: int,
            auto_k: bool,
            k_clusters: int,
            mb_batch: int,
            count_vec,
            svd_obj,
            norm_obj,
            d_word, d_char
        ):
            n = X_space.shape[0]

            # Per-bucket stabilizer params
            min_size, merge_th, reassign_th = _bucket_stabilizer_params(bucket_name)

            if n <= 1:
                labels = np.zeros((n,), dtype=int) if n == 1 else np.array([], dtype=int)
                centers = None
                chosen_k = int(n) if n > 0 else 0
                return stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th), centers, chosen_k

            if n < 10:
                k_small = min(max(2, n // 2), n)
                kmeans = MiniBatchKMeans(
                    n_clusters=int(k_small),
                    batch_size=int(mb_batch or 4096),
                    random_state=0,
                    n_init="auto"
                )
                labels = kmeans.fit_predict(X_space)
                centers = getattr(kmeans, "cluster_centers_", None)
                labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
                return labels, centers, int(len(set(labels)))

            if use_hdbscan and HDBSCAN_OK and isinstance(X_space, np.ndarray) and X_space.shape[0] >= max(50, hdb_min_cluster):
                min_samples = None if int(hdb_min_samples or 0) <= 0 else int(hdb_min_samples)
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=int(hdb_min_cluster or 60),
                    min_samples=min_samples,
                    metric='euclidean',
                    cluster_selection_epsilon=0.0,
                    core_dist_n_jobs=1
                )
                labels = clusterer.fit_predict(X_space)
                centers = None
                labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
                chosen_k = int(len(set([l for l in labels if l >= 0])))
                return labels, centers, chosen_k

            # Choose k (global rule or kneedle), then per-bucket multiplier
            if bool(auto_k):
                if use_lsa and isinstance(X_space, np.ndarray):
                    k, _ = choose_k_by_kneedle(X_space, ks=(50, 100, 150, 200, 300, 400, 500))
                else:
                    k = auto_k_rule(X_space.shape[0])
            else:
                k = max(10, int(k_clusters or 350))
            k = int(max(2, round(k * _bucket_k_multiplier(bucket_name))))

            k = min(k, n)

            init = None
            if use_lsa and isinstance(X_space, np.ndarray) and count_vec is not None:
                seeds = seeded_centroids_in_lsa(
                    CORR_LEX, count_vec, svd_obj.components_, norm_obj,
                    d_word=d_word, d_full=(d_word + d_char), k=k
                )
                if seeds is not None and seeds.shape[0] == k:
                    init = seeds

            kmeans = MiniBatchKMeans(
                n_clusters=k,
                batch_size=int(mb_batch or 4096),
                random_state=0,
                n_init="auto" if init is None else 1,
                init="k-means++" if init is None else init
            )
            labels = kmeans.fit_predict(X_space)
            centers = kmeans.cluster_centers_ if hasattr(kmeans, "cluster_centers_") else None
            if use_lsa and centers is not None:
                labels = merge_close_clusters(labels, centers, thresh=0.95)
            labels = stabilize_labels(X_space, labels, min_size=min_size, merge_thresh=merge_th, reassign_thresh=reassign_th)
            chosen_k = int(len(set(labels)))
            return labels, centers, chosen_k

        # ---- Begin processing ----
        trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
        extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
        extra_terms_lower = [t.lower() for t in extra_terms]

        # Handle Gradio file object
        try:
            infile_path = inbox_file.name
        except Exception:
            infile_path = str(inbox_file) if inbox_file else ""

        recs = _load_json_records(infile_path)
        if not recs:
            return ("**No valid records found.**",
                    None, None, None, None, None,
                    None, None,
                    None,
                    None, None, None,
                    None, None,
                    None, None, None, None,
                    None, None, None,
                    None, None,
                    None)

        normd = []
        for r in tqdm(recs, desc="Normalize", leave=False):
            out = normalize_email_record(r, use_langdetect=(not bool(skip_lang)))
            if out and out.get("body_text") is not None:
                normd.append(out)
        df = pd.DataFrame(normd)
        if df.empty:
            return ("**No usable email records.**",
                    None, None, None, None, None,
                    None, None,
                    None,
                    None, None, None,
                    None, None,
                    None, None, None, None,
                    None, None, None,
                    None, None,
                    None)

        df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
        df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
        df = compute_sentiment_column(df)

        # Stage-1 routing (bucketing)
        df["bucket"] = df.apply(route_email_row, axis=1)
        df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_domain", "")), axis=1)
        df["is_notify"] = df.apply(lambda r: is_notification_like(r.get("subject", ""), r.get("body_text", ""), r.get("from_email", ""), r.get("from_domain", "")), axis=1)
        df.loc[df["is_news"] == True, "bucket"]   = "Newsletters/Alerts"
        df.loc[df["is_notify"] == True, "bucket"] = "IT/Security"

        # Flags
        flags = []
        for _, row in df.iterrows():
            f = []
            h = _hour_of(row.get("date") or "")
            if h is not None and (h < 6 or h > 22):
                f.append("odd-hours")
            fd = (row.get("from_domain") or "").lower()
            if (fd in PERSONAL_DOMAINS) and (fd not in trusted):
                f.append("personal-mail")
            f += _attachment_flags(row.get("attachments") or [])
            flags.append(f)
        df["flags"] = flags

        # Split out stable buckets
        df_main = df[~df["bucket"].isin(["Newsletters/Alerts", "IT/Security"])].reset_index(drop=True)
        df_news = df[df["bucket"] == "Newsletters/Alerts"].reset_index(drop=True)
        df_alerts = df[df["bucket"] == "IT/Security"].reset_index(drop=True)

        # Optional embeddings
        kv = None
        emb_dim = 0
        if bool(use_embeddings):
            kv, emb_dim = _load_embeddings(embeddings_path or "", bool(embeddings_binary))

        # Build partitions: per language within bucket if requested
        parts = []
        if bool(per_language):
            for bkt, g_bucket in df_main.groupby("bucket", dropna=False):
                for lang_code, grp in g_bucket.groupby("lang", dropna=False):
                    parts.append(((bkt, lang_code), grp.copy()))
        else:
            for bkt, grp in df_main.groupby("bucket", dropna=False):
                parts.append(((bkt, "all"), grp.copy()))

        labels_list, cluster_name_list, anomaly_list = [], [], []
        bucket_indexers = []
        X_reduced_holder = None
        term_names_global = {}
        single_partition = (len(parts) == 1)
        d_word_agg, d_char_agg = 0, 0
        svd_obj_local, norm_obj_local = None, None

        for (bucket_name, _lang), df_part in parts:
            if df_part.empty:
                continue
            texts, subjects_only = _make_texts(df_part)

            X_full, count_vec, char_vec, _, d_word, d_char, _ = _vectorize_block(
                texts=texts,
                use_bigrams=bool(use_bigrams),
                max_features=int(max_features or 120000),
                min_df=int(min_df or 3),
                max_df=float(max_df or 0.7),
                use_hashing=bool(use_hashing),
                hash_bits=int(hash_bits or 18),
            )
            d_word_agg += d_word
            d_char_agg += d_char

            X_reduced, svd_obj_local, norm_obj_local = _reduce_space(X_full, bool(use_lsa), int(lsa_dim or 256))
            X_space = (X_reduced if X_reduced is not None else X_full)

            if kv:
                X_space, _ = _attach_embeddings(
                    texts, X_space, bool(use_lsa) and X_reduced is not None, kv, emb_dim, float(embed_weight)
                )

            anomaly_scores = np.full((len(df_part),), np.nan, dtype=np.float32)
            if X_reduced is not None and bool(use_iso) and ISO_OK and X_reduced.shape[0] >= 50:
                try:
                    iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0).fit(X_reduced)
                    anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
                except Exception:
                    pass

            labels, _, _ = _cluster_space(
                X_space=X_space,
                bucket_name=bucket_name,
                df_part=df_part,
                use_lsa=bool(use_lsa) and X_reduced is not None,
                use_hdbscan=bool(use_hdbscan),
                hdb_min_cluster=int(hdb_min_cluster or 60),
                hdb_min_samples=int(hdb_min_samples or 0),
                auto_k=bool(auto_k),
                k_clusters=int(k_clusters or 350),
                mb_batch=int(mb_batch or 4096),
                count_vec=count_vec,
                svd_obj=svd_obj_local,
                norm_obj=norm_obj_local,
                d_word=d_word,
                d_char=d_char,
            )

            term_names = cluster_labels_pmi_bigram(
                texts=texts, labels=labels, subjects=subjects_only,
                topn=6, subject_alpha=0.75, global_ubiq_cut=0.20, subject_min_cov=0.30
            )

            bucket_indexers.append(df_part.index)
            labels_list.append(pd.Series(labels, index=df_part.index))
            cluster_name_list.append(pd.Series([term_names.get(int(c), "noise" if int(c) < 0 else f"cluster_{int(c)}") for c in labels], index=df_part.index))
            anomaly_list.append(pd.Series(anomaly_scores, index=df_part.index))
            term_names_global.update({int(k): v for k, v in term_names.items()})

            if single_partition and X_reduced is not None:
                X_reduced_holder = X_reduced

        if labels_list:
            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_id"]   = pd.concat(labels_list).sort_index()
            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "cluster_name"] = pd.concat(cluster_name_list).sort_index()
            df_main.loc[pd.Index(np.concatenate(bucket_indexers)), "anomaly_score"] = pd.concat(anomaly_list).sort_index()
        else:
            df_main["cluster_id"] = -10
            df_main["cluster_name"] = "unclustered"
            df_main["anomaly_score"] = np.nan

        # Assign fixed ids for news/alerts buckets
        if len(df_news):
            df_news.loc[:, "cluster_id"] = -1
            df_news.loc[:, "cluster_name"] = "newsletter/news"
            df_news.loc[:, "anomaly_score"] = np.nan
        if len(df_alerts):
            df_alerts.loc[:, "cluster_id"] = -2
            df_alerts.loc[:, "cluster_name"] = "system/alerts"
            df_alerts.loc[:, "anomaly_score"] = np.nan

        # Merge back
        df = pd.concat([df_main, df_news, df_alerts], ignore_index=True)

        # Label de-dup pass per-bucket
        df = dedupe_all_labels(df)

        # Scores
        df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
        df = compute_context_anomaly(df)

        # Surveillance campaigns
        wl = [w.strip() for w in (watchlist_in or "").split(",") if w.strip()]
        ent_df, samp_df = detect_surveillance_campaigns(df, watchlist=wl, min_mentions=int(min_mentions or 15))
        df = tag_surveillance_emails(df, ent_df, threshold=4.5)

        # Build indexes/search
        index_obj = None
        use_faiss_flag = bool(use_faiss) and FAISS_OK and bool(use_lsa) and (X_reduced_holder is not None) and single_partition
        if use_faiss_flag:
            d = X_reduced_holder.shape[1]
            index_obj = faiss.IndexFlatIP(d)
            index_obj.add(X_reduced_holder)
        else:
            try:
                if bool(use_lsa) and X_reduced_holder is not None and single_partition:
                    nn = NearestNeighbors(metric="cosine", algorithm="brute").fit(X_reduced_holder)
                    index_obj = nn
                else:
                    index_obj = NearestNeighbors(metric="cosine", algorithm="brute").fit(np.zeros((1, 4), dtype=np.float32))
            except Exception:
                pass

        # Summaries
        cluster_counts = (
            df.groupby(["bucket", "cluster_id", "cluster_name"])
              .size()
              .reset_index(name="count")
              .sort_values("count", ascending=False)
              .head(500)
        )
        cluster_counts["label"] = cluster_counts.apply(
            lambda r: f'{r["bucket"]} — {int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
        )
        cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
        bucket_choices = ["(any)"] + sorted(df["bucket"].dropna().unique().tolist())

        domain_counts = (
            df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False).head(100)
        )
        domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
        sender_counts = (
            df.groupby("from_email").size().reset_index(name="count").sort_values("count", ascending=False).head(200)
        )
        sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
        langs = sorted([l for l in df["lang"].dropna().unique() if l and l != "unknown"])
        lang_choices = ["(any)"] + langs
        actors = social_stats(df)
        offp = df[df["flags"].apply(lambda xs: "odd-hours" in xs or "personal-mail" in xs)]
        offhours_table = (
            offp[["date", "from_email", "subject", "flags", "corruption_score"]]
            .sort_values("corruption_score", ascending=False)
            .head(200)
        )
        out_table = _sort_results(df, "context_anomaly_score", "desc")

        vec_state = {
            "use_hashing": bool(use_hashing),
            "hash_bits": int(hash_bits),
            "max_features": int(max_features),
            "min_df": int(min_df),
            "max_df": float(max_df),
            "use_bigrams": bool(use_bigrams),
        }
        status_md = f"**Processed {len(df):,} emails** | clusters ~ {len(cluster_counts):,} (showing top 500)"

        svd_obj_out = svd_obj_local if single_partition else None
        norm_obj_out = norm_obj_local if single_partition else None

        return (
            status_md,                  # status
            cluster_counts, domain_counts, sender_counts,  # summaries
            actors, offhours_table,                        # extra summaries
            ent_df, samp_df,                               # surveillance tables
            out_table,                                     # results table
            df, vec_state, X_reduced_holder,               # states
            index_obj, term_names_global,                  # index + labels
            bool(use_lsa), use_faiss_flag,                 # flags
            gr.update(choices=cluster_choices, value="(any)"),
            gr.update(choices=domain_choices,  value="(any)"),
            gr.update(choices=sender_choices,  value="(any)"),
            gr.update(choices=lang_choices, value="(any)"),
            svd_obj_out, norm_obj_out, (d_word_agg, d_char_agg),
            extra_terms_lower, bool(highlight_toggle),
            gr.update(choices=bucket_choices, value="(any)")
        )

    # Bind Process button
    (run_btn.click)(
        process_file,
        inputs=[
            inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
            use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
            trusted_domains_in, extra_keywords_in, highlight_toggle,
            use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
            per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
            watchlist_in, min_mentions
        ],
        outputs=[
            status, cluster_counts_df, domain_counts_df, sender_counts_df,
            actors_df, offhours_df,
            surv_entities_df, surv_samples_df,
            results_df,
            state_df, state_vec, state_X_reduced,
            state_index, state_term_names,
            state_use_lsa, state_use_faiss,
            cluster_drop, domain_drop, sender_drop, lang_drop,
            state_svd, state_norm, state_dims,
            state_extra_terms, state_highlight,
            bucket_drop,
        ],
    ).then(
        # remember the uploaded file for future "Update" runs
        lambda f: f, inputs=[inbox_file], outputs=[state_inbox]
    )

    # Keep state_inbox in sync whenever a new file is uploaded
    inbox_file.change(lambda f: f, inputs=[inbox_file], outputs=[state_inbox])

    # NEW: Bind Update button — re-run with the last uploaded file + current settings
    update_btn.click(
        process_file,
        inputs=[
            state_inbox, max_features, min_df, max_df, use_bigrams, skip_lang,
            use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
            trusted_domains_in, extra_keywords_in, highlight_toggle,
            use_hashing, hash_bits, use_hdbscan, hdb_min_cluster, hdb_min_samples,
            per_language, use_embeddings, embed_weight, embeddings_path, embeddings_binary,
            watchlist_in, min_mentions
        ],
        outputs=[
            status, cluster_counts_df, domain_counts_df, sender_counts_df,
            actors_df, offhours_df,
            surv_entities_df, surv_samples_df,
            results_df,
            state_df, state_vec, state_X_reduced,
            state_index, state_term_names,
            state_use_lsa, state_use_faiss,
            cluster_drop, domain_drop, sender_drop, lang_drop,
            state_svd, state_norm, state_dims,
            state_extra_terms, state_highlight,
            bucket_drop,
        ],
    )

    # -------- Filtering & Search --------
    def refresh_results(df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, sort_by, sort_dir, hide_noise_flag):
        if df is None or len(df) == 0:
            return pd.DataFrame()
        filt = _apply_filters(
            df, bucket, cluster, domain, sender, lang, sentiment, tag, start, end, hide_noise_flag=bool(hide_noise_flag)
        )
        return _sort_results(filt, sort_by, sort_dir)

    # Re-run when any filter control changes
    for ctrl in [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
                 date_start, date_end, sort_by, sort_dir, hide_noise]:
        ctrl.change(
            refresh_results,
            inputs=[
                state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
                sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
            ],
            outputs=[results_df],
        )

    # Reset filters
    reset_btn.click(
        lambda: ["(any)"] * 7 + [""] * 2 + ["context_anomaly_score", "desc"] + [True],
        [],
        [bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop,
         date_start, date_end, sort_by, sort_dir, hide_noise],
    ).then(
        refresh_results,
        inputs=[
            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise,
        ],
        outputs=[results_df],
    )

    # --- Search ---
    def search_fn(q, df, vec, X_red, index, use_lsa, use_faiss, svd, norm, sort, sdir):
        if not q or df is None or vec is None or index is None:
            return pd.DataFrame(), []

        # Search ignores newsletters/alerts/noise by default
        mask = ~df["cluster_id"].isin([-1, -2, -3])
        df_main = df[mask].reset_index(drop=True)
        if df_main.empty:
            return pd.DataFrame(), []

        q_terms = _tokenize_query(q)
        q_vec = _vectorize_query(q, vec, list(df_main.apply(enrich_text, axis=1)))

        q_emb = _project_query_to_lsa(q_vec, svd, norm) if use_lsa and svd is not None and norm is not None else q_vec
        if q_emb is None:
            return pd.DataFrame(), q_terms

        n_req = min(50, len(df_main))
        if n_req <= 0:
            return pd.DataFrame(), q_terms

        if isinstance(index, NearestNeighbors):
            if hasattr(index, "n_samples_fit_") and index.n_samples_fit_ <= 1:
                return pd.DataFrame(), q_terms
            dists, inds = index.kneighbors(q_emb, n_neighbors=n_req)
            sims = 1.0 - dists[0]
            results = df_main.iloc[inds[0]].copy()
            results["search_score"] = sims
        elif use_faiss and FAISS_OK and hasattr(index, "search"):
            D, I = index.search(q_emb.astype(np.float32), k=n_req)
            results = df_main.iloc[I[0]].copy()
            results["search_score"] = D[0]
        else:
            return pd.DataFrame(), q_terms

        return _sort_results(results, sort, sdir), q_terms

    search_btn.click(
        search_fn,
        inputs=[
            search_query, state_df, state_vec, state_X_reduced, state_index,
            state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir,
        ],
        outputs=[results_df, state_query_terms],
    )

    # --- Reader selection (highlighting) ---
    def on_row_select(evt: gr.SelectData, table, df, term_names, q_terms, extra_terms, do_highlight):
        if evt.index is None or table is None or len(table) == 0 or df is None or len(df) == 0:
            return ""
        row_idx = evt.index[0]
        sel = table.iloc[row_idx]

        # Try to match the original row
        cand = df[
            (df["subject"] == sel.get("subject"))
            & (df["from_email"] == sel.get("from_email"))
            & (df["date"] == sel.get("date"))
        ]
        if cand.empty:
            cand = df[df["subject"] == sel.get("subject")]
        if cand.empty:
            return "Could not find original record."

        row = cand.iloc[0]
        cid = int(row.get("cluster_id", -99))
        clabel = term_names.get(cid, row.get("cluster_name")) if term_names else row.get("cluster_name")
        return build_highlighted_html(
            row,
            query_terms=q_terms,
            cluster_label=f'{row.get("bucket","Other")} / {clabel}',
            do_highlight=do_highlight,
            extra_terms=extra_terms,
        )

    results_df.select(
        on_row_select,
        inputs=[results_df, state_df, state_term_names, state_query_terms, state_extra_terms, state_highlight],
        outputs=[email_view],
    )

    # --- Click-to-filter helpers ---
    def on_click_filter(evt: gr.SelectData, df_sum: pd.DataFrame, col_name: str, out_comp: gr.Dropdown):
        if evt.index is None or df_sum is None or df_sum.empty:
            return gr.update()
        val = df_sum.iloc[evt.index[0]][col_name]
        return gr.update(value=val)
    
    def on_cluster_summary_select(evt: gr.SelectData, df_sum: pd.DataFrame):
        if evt.index is None or df_sum is None or df_sum.empty:
            return gr.update(), gr.update()
        r = df_sum.iloc[evt.index[0]]
        return gr.update(value=r["bucket"]), gr.update(value=r["label"])

    cluster_counts_df.select(
        on_cluster_summary_select, [cluster_counts_df], [bucket_drop, cluster_drop]
    ).then(
        refresh_results,
        inputs=[
            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
        ],
        outputs=[results_df],
    )

    domain_counts_df.select(
        lambda evt, df: on_click_filter(evt, df, "from_domain", domain_drop), [domain_counts_df], [domain_drop]
    ).then(
        refresh_results,
        inputs=[
            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
        ],
        outputs=[results_df],
    )

    sender_counts_df.select(
        lambda evt, df: on_click_filter(evt, df, "from_email", sender_drop), [sender_counts_df], [sender_drop]
    ).then(
        refresh_results,
        inputs=[
            state_df, bucket_drop, cluster_drop, domain_drop, sender_drop, lang_drop,
            sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir, hide_noise
        ],
        outputs=[results_df],
    )

if __name__ == "__main__":
    # Disable SSR to avoid handler arity warnings under server-side rendering
    demo.launch(ssr_mode=False)