Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,13 @@ from sklearn.preprocessing import Normalizer
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from scipy.sparse import hstack
|
| 29 |
|
| 30 |
# Optional fast ANN (CPU)
|
|
@@ -34,7 +41,7 @@ try:
|
|
| 34 |
except Exception:
|
| 35 |
FAISS_OK = False
|
| 36 |
|
| 37 |
-
# Optional
|
| 38 |
try:
|
| 39 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 40 |
VADER_OK = True
|
|
@@ -42,44 +49,59 @@ except Exception:
|
|
| 42 |
VADER_OK = False
|
| 43 |
|
| 44 |
# =================== Regex & Flags ===================
|
| 45 |
-
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 46 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 47 |
-
|
| 48 |
-
# URLs -> "URL" (reduce feature bloat).
|
| 49 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 50 |
-
|
| 51 |
-
# Quote lines ("> ...")
|
| 52 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
| 53 |
-
|
| 54 |
-
# Signature separator: lines after "-- " (standard)
|
| 55 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
| 56 |
-
|
| 57 |
-
# Device footers
|
| 58 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 59 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
| 60 |
-
|
| 61 |
-
# Forward/quoted markers
|
| 62 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 63 |
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 64 |
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 65 |
|
| 66 |
-
# Toggle for language detection (skip for speed)
|
| 67 |
SKIP_LANGDETECT = True
|
| 68 |
|
| 69 |
-
#
|
| 70 |
SUSPECT_PHRASES = [
|
| 71 |
-
|
| 72 |
-
"
|
| 73 |
-
"
|
| 74 |
-
"
|
| 75 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
]
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
# Entity regexes
|
| 79 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 80 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 81 |
-
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order)\b', re.I)
|
| 82 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 85 |
CORR_LEX = {
|
|
@@ -89,19 +111,15 @@ CORR_LEX = {
|
|
| 89 |
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 90 |
}
|
| 91 |
|
| 92 |
-
# =================== Label cleanup helpers ===================
|
| 93 |
EN_STOP = {
|
| 94 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
| 95 |
"that","this","it","be","are","was","were","an","a","you","your","we","our","us",
|
| 96 |
"re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
|
| 97 |
"message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
|
| 98 |
-
"herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
|
| 99 |
-
"ny"
|
| 100 |
-
}
|
| 101 |
-
HE_STOP = {
|
| 102 |
-
"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
|
| 103 |
-
"שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"
|
| 104 |
}
|
|
|
|
| 105 |
MONTHS = {
|
| 106 |
"jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
|
| 107 |
"january","february","march","april","june","july","august","september",
|
|
@@ -116,14 +134,10 @@ def _is_junk_term(t: str) -> bool:
|
|
| 116 |
tl = t.lower()
|
| 117 |
if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
|
| 118 |
return True
|
| 119 |
-
if EMAIL_LIKE_RE.search(tl):
|
| 120 |
-
|
| 121 |
-
if
|
| 122 |
-
|
| 123 |
-
if NUMERIC_RE.match(tl):
|
| 124 |
-
return True
|
| 125 |
-
if ONE_CHAR_RE.match(tl):
|
| 126 |
-
return True
|
| 127 |
return False
|
| 128 |
|
| 129 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
@@ -147,7 +161,7 @@ def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarra
|
|
| 147 |
break
|
| 148 |
return cleaned
|
| 149 |
|
| 150 |
-
# =================== HTML/Text
|
| 151 |
def html_to_text(html: str) -> str:
|
| 152 |
if not html:
|
| 153 |
return ""
|
|
@@ -183,10 +197,19 @@ def parse_name_email(s: str) -> Tuple[str, str]:
|
|
| 183 |
return (m.group(1) or "").strip(), (m.group(2) or "").strip()
|
| 184 |
return "", s.strip()
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
| 187 |
headers: Dict[str, str] = {}
|
| 188 |
lines = (text or "").splitlines()
|
| 189 |
-
header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
|
| 190 |
i = 0
|
| 191 |
saw_header = False
|
| 192 |
while i < len(lines):
|
|
@@ -234,15 +257,27 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 234 |
if str(raw.get("type", "")).lower() == "meta":
|
| 235 |
return {}
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 238 |
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 239 |
if html_content and not body_text_raw:
|
| 240 |
body_text_raw = html_to_text(html_content)
|
| 241 |
-
|
| 242 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 243 |
|
| 244 |
subject_text = ""
|
| 245 |
from_name = from_email = from_domain = ""
|
|
|
|
| 246 |
date_val = raw.get("date") or raw.get("Date") or ""
|
| 247 |
|
| 248 |
if body_text_raw:
|
|
@@ -250,6 +285,8 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 250 |
subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
|
| 251 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 252 |
date_val = headers.get("Date", "") or date_val
|
|
|
|
|
|
|
| 253 |
|
| 254 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 255 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
|
@@ -267,6 +304,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 267 |
sender = raw.get("from") or raw.get("From") or ""
|
| 268 |
from_name, from_email = parse_name_email(sender)
|
| 269 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
|
|
|
| 270 |
|
| 271 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 272 |
|
|
@@ -302,9 +340,11 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 302 |
"from_name": from_name,
|
| 303 |
"from_email": from_email,
|
| 304 |
"from_domain": from_domain,
|
|
|
|
| 305 |
"subject": subject_norm,
|
| 306 |
"body_text": body_text,
|
| 307 |
"lang": lang,
|
|
|
|
| 308 |
"text_hash": text_hash,
|
| 309 |
}
|
| 310 |
|
|
@@ -322,6 +362,8 @@ def has_suspect_tag(text: str) -> List[str]:
|
|
| 322 |
if "wire" in low or "transfer" in low or "cash" in low:
|
| 323 |
if "finance" not in tags:
|
| 324 |
tags.append("finance")
|
|
|
|
|
|
|
| 325 |
return tags
|
| 326 |
|
| 327 |
def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -337,19 +379,60 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 337 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 338 |
return df
|
| 339 |
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
subject = (row.get("subject") or "").strip()
|
| 342 |
body = (row.get("body_text") or "").strip()
|
| 343 |
from_email = row.get("from_email") or ""
|
| 344 |
date = row.get("date") or ""
|
| 345 |
tags = row.get("tags") or []
|
|
|
|
| 346 |
sentiment = row.get("sentiment") or "(unknown)"
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
def hi(text: str) -> str:
|
| 349 |
-
if not text or not
|
| 350 |
return text
|
| 351 |
out = text
|
| 352 |
-
for qt in
|
| 353 |
if not qt:
|
| 354 |
continue
|
| 355 |
try:
|
|
@@ -366,9 +449,17 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 366 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 367 |
body_html = body_h.replace("\n", "<br/>")
|
| 368 |
|
|
|
|
|
|
|
|
|
|
| 369 |
tag_html = ""
|
|
|
|
| 370 |
if isinstance(tags, list) and tags:
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 374 |
|
|
@@ -401,9 +492,7 @@ class BM25Transformer:
|
|
| 401 |
self.avgdl_ = None
|
| 402 |
|
| 403 |
def fit(self, X):
|
| 404 |
-
# X is term-frequency (CountVectorizer)
|
| 405 |
N = X.shape[0]
|
| 406 |
-
# document frequency per term
|
| 407 |
df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
|
| 408 |
self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
|
| 409 |
dl = np.asarray(X.sum(axis=1)).ravel()
|
|
@@ -423,7 +512,6 @@ class BM25Transformer:
|
|
| 423 |
data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
|
| 424 |
return X
|
| 425 |
|
| 426 |
-
# Add enrichment tokens to help the model lock onto key signals
|
| 427 |
def enrich_text(row: pd.Series) -> str:
|
| 428 |
subj = row.get("subject","") or ""
|
| 429 |
body = row.get("body_text","") or ""
|
|
@@ -433,6 +521,7 @@ def enrich_text(row: pd.Series) -> str:
|
|
| 433 |
if PHONE_RE.search(t): tokens.append("__HAS_PHONE__")
|
| 434 |
if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
|
| 435 |
if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
|
|
|
|
| 436 |
return (t + " " + " ".join(tokens)).strip()
|
| 437 |
|
| 438 |
# =================== Cluster labeling: PMI bigrams ===================
|
|
@@ -466,7 +555,7 @@ def cluster_labels_pmi_bigram(texts, labels, topn=6):
|
|
| 466 |
labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
|
| 467 |
return labels_out
|
| 468 |
|
| 469 |
-
# =================== Auto-k
|
| 470 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 471 |
n = X.shape[0]
|
| 472 |
if n > 40000:
|
|
@@ -490,10 +579,8 @@ def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
|
| 490 |
return k_best, dict(zip(ks, inertias))
|
| 491 |
|
| 492 |
def auto_k_rule(n_docs: int) -> int:
|
| 493 |
-
# Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
|
| 494 |
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 495 |
|
| 496 |
-
# =================== Merge close clusters (LSA space only to save RAM) ===================
|
| 497 |
def merge_close_clusters(labels, centers, thresh=0.92):
|
| 498 |
centers = sk_normalize(centers)
|
| 499 |
sim = cosine_similarity(centers, centers)
|
|
@@ -517,11 +604,9 @@ def merge_close_clusters(labels, centers, thresh=0.92):
|
|
| 517 |
labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
|
| 518 |
return labels2
|
| 519 |
|
| 520 |
-
# =================== Seeded centroids (only if LSA enabled) ===================
|
| 521 |
def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
|
| 522 |
lsa_components: np.ndarray, norm_obj: Normalizer,
|
| 523 |
d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
|
| 524 |
-
# Build a few unit vectors in word-term space based on lexicons
|
| 525 |
seeds_word = []
|
| 526 |
vocab = count_vec.vocabulary_
|
| 527 |
for _, words in lexicons.items():
|
|
@@ -536,30 +621,45 @@ def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVect
|
|
| 536 |
seeds_word.append(v)
|
| 537 |
if not seeds_word:
|
| 538 |
return None
|
| 539 |
-
# Lift to full feature space (word + char) by padding zeros for char dims
|
| 540 |
seeds_full = []
|
| 541 |
for v in seeds_word:
|
| 542 |
vf = np.zeros((d_full,), dtype=np.float32)
|
| 543 |
vf[:d_word] = v
|
| 544 |
seeds_full.append(vf)
|
| 545 |
-
seeds_full = np.stack(seeds_full, axis=0)
|
| 546 |
-
|
| 547 |
-
seeds_red = seeds_full @ lsa_components.T # (s, lsa_dim)
|
| 548 |
seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
|
| 549 |
-
# If fewer than k seeds, KMeans will accept; scikit-learn requires init shape == (k, d)
|
| 550 |
-
# We’ll return only if seeds count >= 2 to be meaningful; otherwise None
|
| 551 |
if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
|
| 552 |
return seeds_red
|
| 553 |
return None
|
| 554 |
|
| 555 |
-
# ===================
|
| 556 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
score = 0.0
|
| 558 |
txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 559 |
for ph in SUSPECT_PHRASES:
|
| 560 |
if ph in txt:
|
| 561 |
score += 2.0
|
| 562 |
break
|
|
|
|
|
|
|
| 563 |
if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
|
| 564 |
score += 1.5
|
| 565 |
if MONEY_RE.search(txt): score += 0.7
|
|
@@ -569,6 +669,14 @@ def corruption_score(row):
|
|
| 569 |
body_len = len(row.get("body_text",""))
|
| 570 |
if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
|
| 571 |
score += 0.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
return score
|
| 573 |
|
| 574 |
# =================== Gradio UI ===================
|
|
@@ -587,13 +695,11 @@ CSS = """
|
|
| 587 |
mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
|
| 588 |
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 589 |
.small { color:#475569; font-size:12px; }
|
|
|
|
| 590 |
"""
|
| 591 |
|
| 592 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 593 |
-
gr.Markdown(""
|
| 594 |
-
# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans
|
| 595 |
-
**Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, corruption score, and sentiment.
|
| 596 |
-
""")
|
| 597 |
|
| 598 |
with gr.Row():
|
| 599 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
|
@@ -613,21 +719,25 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 613 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 614 |
with gr.Row():
|
| 615 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
|
|
|
| 616 |
|
| 617 |
-
with gr.Accordion("
|
| 618 |
with gr.Row():
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)"
|
| 623 |
-
)
|
| 624 |
with gr.Row():
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
)
|
|
|
|
|
|
|
|
|
|
| 628 |
with gr.Row():
|
| 629 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 630 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
|
|
|
|
|
|
| 631 |
|
| 632 |
with gr.Row():
|
| 633 |
run_btn = gr.Button("Process", variant="primary")
|
|
@@ -635,9 +745,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 635 |
status = gr.Markdown("")
|
| 636 |
|
| 637 |
with gr.Row():
|
| 638 |
-
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
|
| 639 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
gr.Markdown("### Search")
|
| 642 |
with gr.Row():
|
| 643 |
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
|
@@ -646,17 +760,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 646 |
email_view = gr.HTML(label="Reader")
|
| 647 |
|
| 648 |
# State
|
| 649 |
-
state_df = gr.State()
|
| 650 |
-
state_vec = gr.State()
|
| 651 |
-
state_X_reduced = gr.State()
|
| 652 |
-
state_index = gr.State()
|
| 653 |
-
state_term_names = gr.State()
|
| 654 |
-
state_query_terms = gr.State()
|
| 655 |
state_use_lsa = gr.State()
|
| 656 |
state_use_faiss = gr.State()
|
| 657 |
state_svd = gr.State()
|
| 658 |
state_norm = gr.State()
|
| 659 |
-
state_dims = gr.State()
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# -------- IO helpers --------
|
| 662 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
@@ -691,6 +807,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 691 |
df: pd.DataFrame,
|
| 692 |
cluster: Optional[str],
|
| 693 |
domain: Optional[str],
|
|
|
|
|
|
|
| 694 |
sentiment: str,
|
| 695 |
tag_value: str,
|
| 696 |
start: str,
|
|
@@ -704,10 +822,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 704 |
out = out[out["cluster_id"] == cid]
|
| 705 |
if domain and domain != "(any)":
|
| 706 |
out = out[out["from_domain"] == domain]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 708 |
out = out[out["sentiment"].astype(str) == sentiment]
|
| 709 |
if tag_value and tag_value != "(any)":
|
| 710 |
-
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))
|
|
|
|
| 711 |
if start:
|
| 712 |
try:
|
| 713 |
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
|
@@ -722,19 +845,47 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 722 |
pass
|
| 723 |
return out
|
| 724 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
# -------- Main pipeline --------
|
| 726 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 727 |
-
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss
|
|
|
|
| 728 |
if inbox_file is None:
|
| 729 |
return ("**Please upload a file.**",
|
| 730 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
|
|
|
|
| 731 |
|
| 732 |
use_lang = not bool(skip_lang)
|
| 733 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
recs = _load_json_records(inbox_file.name)
|
| 735 |
if not recs:
|
| 736 |
return ("**No valid records found.**",
|
| 737 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
|
|
|
|
| 738 |
|
| 739 |
# Normalize
|
| 740 |
normd = []
|
|
@@ -745,19 +896,35 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 745 |
df = pd.DataFrame(normd)
|
| 746 |
if df.empty:
|
| 747 |
return ("**No usable email records after normalization.**",
|
| 748 |
-
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
|
|
|
|
| 749 |
|
| 750 |
# Deduplicate conservatively
|
| 751 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 752 |
|
| 753 |
-
# Tags
|
| 754 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 755 |
df = compute_sentiment_column(df)
|
| 756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
# Enriched texts (adds __HAS_*__ flags)
|
| 758 |
texts = list(df.apply(enrich_text, axis=1))
|
| 759 |
|
| 760 |
-
# === Vectorization
|
| 761 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 762 |
count_vec = CountVectorizer(
|
| 763 |
analyzer="word",
|
|
@@ -784,42 +951,49 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 784 |
d_char = X_char.shape[1]
|
| 785 |
d_full = X_full.shape[1]
|
| 786 |
|
| 787 |
-
# LSA
|
| 788 |
use_lsa = bool(use_lsa)
|
| 789 |
X_reduced = None
|
| 790 |
svd_obj = None
|
| 791 |
norm_obj = None
|
| 792 |
if use_lsa:
|
| 793 |
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 794 |
-
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
|
| 795 |
norm_obj = Normalizer(copy=False)
|
| 796 |
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 797 |
del X_reduced_tmp
|
| 798 |
gc.collect()
|
| 799 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
# K selection
|
| 801 |
if bool(auto_k):
|
| 802 |
if use_lsa:
|
| 803 |
k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
|
| 804 |
else:
|
| 805 |
-
# fallback: heuristic rule on doc count
|
| 806 |
k = auto_k_rule(X_full.shape[0])
|
| 807 |
else:
|
| 808 |
k = max(10, int(k_clusters or 350))
|
| 809 |
|
| 810 |
-
# Optional seeded init (only in LSA space
|
| 811 |
init = None
|
| 812 |
if use_lsa:
|
| 813 |
seeds = seeded_centroids_in_lsa(
|
| 814 |
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 815 |
d_word=d_word, d_full=d_full, k=k
|
| 816 |
)
|
| 817 |
-
if seeds is not None and seeds.shape[0]
|
| 818 |
-
|
| 819 |
-
# For MiniBatchKMeans, we must provide exactly k centers or fall back to k-means++.
|
| 820 |
-
# So use seeds only if seeds.shape[0] == k; otherwise None.
|
| 821 |
-
if seeds.shape[0] == k:
|
| 822 |
-
init = seeds
|
| 823 |
|
| 824 |
# KMeans clustering (use LSA space if enabled)
|
| 825 |
X_space = (X_reduced if use_lsa else X_full)
|
|
@@ -832,18 +1006,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 832 |
)
|
| 833 |
labels = kmeans.fit_predict(X_space)
|
| 834 |
|
| 835 |
-
#
|
| 836 |
if use_lsa:
|
| 837 |
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
|
| 838 |
|
| 839 |
df["cluster_id"] = labels
|
| 840 |
|
| 841 |
-
#
|
| 842 |
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 843 |
df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 844 |
|
| 845 |
-
# CorruptionScore
|
| 846 |
-
df["corruption_score"] = df.apply(corruption_score, axis=1)
|
| 847 |
|
| 848 |
# Build search index
|
| 849 |
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
|
@@ -877,15 +1051,33 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 877 |
)
|
| 878 |
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 879 |
|
| 880 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
show_df = df.copy()
|
| 882 |
if "date" in show_df.columns and show_df["date"].notna().any():
|
| 883 |
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 884 |
else:
|
| 885 |
show_df["_dt"] = pd.NaT
|
| 886 |
show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
|
| 887 |
-
|
| 888 |
-
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "corruption_score"]
|
| 889 |
out_table = show_df[cols_out].head(500)
|
| 890 |
|
| 891 |
vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
|
|
@@ -894,66 +1086,91 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 894 |
f"**Processed {len(df):,} emails** \n"
|
| 895 |
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
|
| 896 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 897 |
-
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'}"
|
|
|
|
| 898 |
)
|
| 899 |
|
| 900 |
gc.collect()
|
| 901 |
|
| 902 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 903 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
|
|
|
|
|
|
| 904 |
|
| 905 |
return (
|
| 906 |
status_md,
|
| 907 |
cluster_counts, domain_counts,
|
|
|
|
| 908 |
out_table,
|
| 909 |
df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 910 |
use_lsa, bool(use_faiss),
|
| 911 |
-
cluster_update, domain_update,
|
| 912 |
svd_obj, norm_obj,
|
| 913 |
-
(d_word, d_char)
|
|
|
|
| 914 |
)
|
| 915 |
|
| 916 |
(run_btn.click)(
|
| 917 |
process_file,
|
| 918 |
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 919 |
-
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss
|
|
|
|
| 920 |
outputs=[status,
|
| 921 |
cluster_counts_df, domain_counts_df,
|
|
|
|
| 922 |
results_df,
|
| 923 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 924 |
state_use_lsa, state_use_faiss,
|
| 925 |
-
cluster_drop, domain_drop,
|
| 926 |
state_svd, state_norm,
|
| 927 |
-
state_dims
|
|
|
|
| 928 |
)
|
| 929 |
|
| 930 |
# -------- Filtering & Search --------
|
| 931 |
-
def
|
| 932 |
-
if df is None or len(df)
|
| 933 |
return pd.DataFrame()
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
if "date" in filt.columns and filt["date"].notna().any():
|
| 937 |
-
tmp = filt.copy()
|
| 938 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
|
| 943 |
-
for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
|
| 944 |
ctrl.change(
|
| 945 |
refresh_results,
|
| 946 |
-
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 947 |
outputs=[results_df]
|
| 948 |
)
|
| 949 |
|
| 950 |
reset_btn.click(
|
| 951 |
-
lambda: [
|
| 952 |
inputs=[],
|
| 953 |
-
outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
|
| 954 |
).then(
|
| 955 |
refresh_results,
|
| 956 |
-
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 957 |
outputs=[results_df]
|
| 958 |
)
|
| 959 |
|
|
@@ -986,7 +1203,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 986 |
q_full = hstack([q_word, q_char], format="csr")
|
| 987 |
return q_full
|
| 988 |
|
| 989 |
-
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj):
|
| 990 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 991 |
return pd.DataFrame(), []
|
| 992 |
q_terms = _tokenize_query(q)
|
|
@@ -1003,31 +1220,41 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1003 |
inds = indices[0]
|
| 1004 |
sims = 1.0 - distances[0]
|
| 1005 |
results = df.iloc[inds].copy()
|
| 1006 |
-
results["
|
| 1007 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1008 |
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 1009 |
inds = I[0]
|
| 1010 |
sims = D[0]
|
| 1011 |
results = df.iloc[inds].copy()
|
| 1012 |
-
results["
|
| 1013 |
else:
|
| 1014 |
return pd.DataFrame(), q_terms
|
| 1015 |
|
| 1016 |
-
|
| 1017 |
-
# Rerank by a blend: 0.7 * ANN score + 0.3 * corruption_score (scaled)
|
| 1018 |
cs = results["corruption_score"].fillna(0.0)
|
| 1019 |
cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
|
| 1020 |
-
results["_blend"] = 0.7*results["
|
| 1021 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
return results[cols].head(50), q_terms
|
| 1023 |
|
| 1024 |
search_btn.click(
|
| 1025 |
search_fn,
|
| 1026 |
-
inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss, state_svd, state_norm],
|
| 1027 |
outputs=[results_df, state_query_terms]
|
| 1028 |
)
|
| 1029 |
|
| 1030 |
-
def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str],
|
|
|
|
| 1031 |
try:
|
| 1032 |
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
| 1033 |
except Exception:
|
|
@@ -1052,13 +1279,34 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1052 |
row = cand.iloc[0]
|
| 1053 |
cid = int(row.get("cluster_id", -1))
|
| 1054 |
clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
|
| 1055 |
-
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel)
|
| 1056 |
|
| 1057 |
results_df.select(
|
| 1058 |
on_row_select,
|
| 1059 |
-
inputs=[results_df, state_df, state_term_names, state_query_terms],
|
| 1060 |
outputs=[email_view]
|
| 1061 |
)
|
| 1062 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1063 |
if __name__ == "__main__":
|
| 1064 |
demo.launch()
|
|
|
|
| 25 |
from sklearn.preprocessing import normalize as sk_normalize
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
|
| 28 |
+
# Optional light anomaly detection
|
| 29 |
+
try:
|
| 30 |
+
from sklearn.ensemble import IsolationForest
|
| 31 |
+
ISO_OK = True
|
| 32 |
+
except Exception:
|
| 33 |
+
ISO_OK = False
|
| 34 |
+
|
| 35 |
from scipy.sparse import hstack
|
| 36 |
|
| 37 |
# Optional fast ANN (CPU)
|
|
|
|
| 41 |
except Exception:
|
| 42 |
FAISS_OK = False
|
| 43 |
|
| 44 |
+
# Optional tiny sentiment
|
| 45 |
try:
|
| 46 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 47 |
VADER_OK = True
|
|
|
|
| 49 |
VADER_OK = False
|
| 50 |
|
| 51 |
# =================== Regex & Flags ===================
|
|
|
|
| 52 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
|
|
|
|
|
|
| 53 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
|
|
|
|
|
|
| 54 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
|
|
|
|
|
|
| 55 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
|
|
|
|
|
|
| 56 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 57 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
|
|
|
|
|
|
| 58 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 59 |
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 60 |
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 61 |
|
|
|
|
| 62 |
SKIP_LANGDETECT = True
|
| 63 |
|
| 64 |
+
# ==== Expanded corruption lexicon ====
|
| 65 |
SUSPECT_PHRASES = [
|
| 66 |
+
# core corruption/finance
|
| 67 |
+
"off the books","cover up","kickback","bribe","under the table",
|
| 68 |
+
"no inspection","special fee","friendly payment","confidential deal",
|
| 69 |
+
"nobody will find out","pay to play","cash only","shell company",
|
| 70 |
+
"bid rigging","embezzle","slush fund","false invoice","ghost employee",
|
| 71 |
+
"contract splitting","grease payment","unreported","unrecorded",
|
| 72 |
+
# secrecy/evasion
|
| 73 |
+
"off the record","just between us","don’t quote me on this","dont quote me on this",
|
| 74 |
+
"we never had this conversation","keep this between us","not ethical","illegal",
|
| 75 |
+
"grey area","gray area","write off","failed investment","they owe it to me",
|
| 76 |
+
# off-channel comms
|
| 77 |
+
"let’s take this offline","lets take this offline","send to my gmail","send to my yahoo",
|
| 78 |
+
"don’t leave a trail","dont leave a trail","call my cell","text me","don’t text me","dont text me",
|
| 79 |
+
"tell you on the phone","talk in person","come by my office","vpn",
|
| 80 |
+
# financial secrecy & accounting games
|
| 81 |
+
"tax haven","off-shore account","offshore account","backdate","pull earnings forward",
|
| 82 |
+
"delete this email","no inspection","special fees","wire instructions",
|
| 83 |
]
|
| 84 |
+
# Evasive acronyms / slang (case-insensitive)
|
| 85 |
+
EVASIVE_ACRO_RE = re.compile(r'\b(?:TYOP|LDL|TOL|OTR|TXT|TYL)\b', re.I)
|
| 86 |
|
| 87 |
+
# Entity regexes
|
| 88 |
MONEY_RE = re.compile(r'(\$|USD|EUR|ILS|NIS)\s?\d[\d,.\s]*', re.I)
|
| 89 |
PHONE_RE = re.compile(r'(\+?\d{1,3}[-\s.]?)?(\(?\d{2,4}\)?[-\s.]?)?\d{3,4}[-\s.]?\d{4}')
|
| 90 |
+
INVOICE_RE = re.compile(r'\b(invoice|inv\.\s?\d+|po\s?#?\d+|purchase order|wire)\b', re.I)
|
| 91 |
COMPANY_RE = re.compile(r'\b(LLC|Ltd|Limited|Inc|GmbH|S\.A\.|S\.p\.A\.)\b')
|
| 92 |
+
ATTACH_NAME_RE = re.compile(r'\b(agreement|contract|invoice|wire|payment|instructions|accounts?|offshore|tax|statement)\b', re.I)
|
| 93 |
+
|
| 94 |
+
# Off-channel patterns (apps / phrases)
|
| 95 |
+
OFFCHANNEL_PATTERNS = [
|
| 96 |
+
r"\bwhatsapp\b", r"\bsignal\b", r"\btelegram\b", r"\bwechat\b",
|
| 97 |
+
r"send to my (gmail|yahoo|protonmail)", r"(call|text) (me|my cell)",
|
| 98 |
+
r"take this offline", r"don.?t (text|email) (me|this)",
|
| 99 |
+
r"\bOTR\b", r"\bTOL\b", r"\bTYOP\b", r"\bLDL\b"
|
| 100 |
+
]
|
| 101 |
+
OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
|
| 102 |
+
|
| 103 |
+
# Common personal mail domains (used with user-specified trusted org domains)
|
| 104 |
+
PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
|
| 105 |
|
| 106 |
# Optional seeded themes for semi-supervised init (used only when LSA is ON)
|
| 107 |
CORR_LEX = {
|
|
|
|
| 111 |
"money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
|
| 112 |
}
|
| 113 |
|
| 114 |
+
# =================== Label cleanup helpers (unchanged core) ===================
|
| 115 |
EN_STOP = {
|
| 116 |
"the","of","and","to","in","is","for","on","at","with","from","by","or","as",
|
| 117 |
"that","this","it","be","are","was","were","an","a","you","your","we","our","us",
|
| 118 |
"re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
|
| 119 |
"message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
|
| 120 |
+
"herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard","ny"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
}
|
| 122 |
+
HE_STOP = {"של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה","שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"}
|
| 123 |
MONTHS = {
|
| 124 |
"jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
|
| 125 |
"january","february","march","april","june","july","august","september",
|
|
|
|
| 134 |
tl = t.lower()
|
| 135 |
if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
|
| 136 |
return True
|
| 137 |
+
if EMAIL_LIKE_RE.search(tl): return True
|
| 138 |
+
if YEAR_RE.match(tl): return True
|
| 139 |
+
if NUMERIC_RE.match(tl): return True
|
| 140 |
+
if ONE_CHAR_RE.match(tl): return True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
return False
|
| 142 |
|
| 143 |
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
|
|
|
|
| 161 |
break
|
| 162 |
return cleaned
|
| 163 |
|
| 164 |
+
# =================== HTML/Text & Header Parsing ===================
|
| 165 |
def html_to_text(html: str) -> str:
|
| 166 |
if not html:
|
| 167 |
return ""
|
|
|
|
| 197 |
return (m.group(1) or "").strip(), (m.group(2) or "").strip()
|
| 198 |
return "", s.strip()
|
| 199 |
|
| 200 |
+
def parse_multi_emails(s: str) -> List[str]:
    """Extract every e-mail address from a comma-separated recipient header.

    Uses the stdlib RFC 5322 parser instead of a hand-rolled comma split, so
    quoted display names that themselves contain commas are handled correctly,
    e.g. '"Doe, John" <jd@x.com>, jane@y.org' -> ['jd@x.com', 'jane@y.org'].

    Args:
        s: Raw ``To:``/``Cc:`` header value; may be empty or ``None``.

    Returns:
        Address strings in order of appearance (possibly empty list).
    """
    if not s:
        return []
    from email.utils import getaddresses  # stdlib RFC 5322 address-list parser
    # getaddresses may emit bare words for malformed fragments; keep only
    # tokens that actually look like addresses (contain an '@').
    return [addr for _, addr in getaddresses([s]) if addr and "@" in addr]
|
| 208 |
+
|
| 209 |
def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
| 210 |
headers: Dict[str, str] = {}
|
| 211 |
lines = (text or "").splitlines()
|
| 212 |
+
header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject|Subject:|To:|Cc:|Bcc:|From:):')
|
| 213 |
i = 0
|
| 214 |
saw_header = False
|
| 215 |
while i < len(lines):
|
|
|
|
| 257 |
if str(raw.get("type", "")).lower() == "meta":
|
| 258 |
return {}
|
| 259 |
|
| 260 |
+
# attachments (names); accept common schemas
|
| 261 |
+
attach_names = []
|
| 262 |
+
atts = raw.get("attachments") or raw.get("Attachments") or raw.get("files") or []
|
| 263 |
+
if isinstance(atts, list):
|
| 264 |
+
for a in atts:
|
| 265 |
+
if isinstance(a, dict):
|
| 266 |
+
name = a.get("filename") or a.get("name") or ""
|
| 267 |
+
else:
|
| 268 |
+
name = str(a)
|
| 269 |
+
if name:
|
| 270 |
+
attach_names.append(str(name))
|
| 271 |
+
|
| 272 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 273 |
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 274 |
if html_content and not body_text_raw:
|
| 275 |
body_text_raw = html_to_text(html_content)
|
|
|
|
| 276 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 277 |
|
| 278 |
subject_text = ""
|
| 279 |
from_name = from_email = from_domain = ""
|
| 280 |
+
to_emails: List[str] = []
|
| 281 |
date_val = raw.get("date") or raw.get("Date") or ""
|
| 282 |
|
| 283 |
if body_text_raw:
|
|
|
|
| 285 |
subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
|
| 286 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 287 |
date_val = headers.get("Date", "") or date_val
|
| 288 |
+
to_emails = parse_multi_emails(headers.get("To","") or (raw.get("to") or "")) + \
|
| 289 |
+
parse_multi_emails(headers.get("Cc","") or (raw.get("cc") or ""))
|
| 290 |
|
| 291 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 292 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
|
|
|
| 304 |
sender = raw.get("from") or raw.get("From") or ""
|
| 305 |
from_name, from_email = parse_name_email(sender)
|
| 306 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 307 |
+
to_emails = parse_multi_emails(raw.get("to") or "") + parse_multi_emails(raw.get("cc") or "")
|
| 308 |
|
| 309 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 310 |
|
|
|
|
| 340 |
"from_name": from_name,
|
| 341 |
"from_email": from_email,
|
| 342 |
"from_domain": from_domain,
|
| 343 |
+
"to_emails": to_emails,
|
| 344 |
"subject": subject_norm,
|
| 345 |
"body_text": body_text,
|
| 346 |
"lang": lang,
|
| 347 |
+
"attachments": attach_names,
|
| 348 |
"text_hash": text_hash,
|
| 349 |
}
|
| 350 |
|
|
|
|
| 362 |
if "wire" in low or "transfer" in low or "cash" in low:
|
| 363 |
if "finance" not in tags:
|
| 364 |
tags.append("finance")
|
| 365 |
+
if OFFCHANNEL_RE.search(low) or EVASIVE_ACRO_RE.search(low):
|
| 366 |
+
tags.append("off-channel")
|
| 367 |
return tags
|
| 368 |
|
| 369 |
def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 379 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 380 |
return df
|
| 381 |
|
| 382 |
+
# Visual highlight helpers
|
| 383 |
+
def _compile_highlight_terms(row: pd.Series, extra_terms: List[str]) -> List[str]:
    """Collect up to 24 literal strings worth highlighting for one email row.

    Sources, in priority order: suspect phrases present in the text, entity
    markers ("$", "invoice"), a "phone" marker when a phone number matches,
    and user-supplied extra phrases that literally occur in the text.
    """
    blob = (row.get("subject", "") + " " + row.get("body_text", "")).lower()
    found: List[str] = []
    # Suspect phrases that literally occur in subject/body.
    found.extend(p for p in SUSPECT_PHRASES if p in blob)
    # Entity markers.
    if MONEY_RE.search(blob):
        found.append("$")
    if INVOICE_RE.search(blob):
        found.append("invoice")
    # Regex-based hit recorded as a literal sample token.
    if PHONE_RE.search(row.get("body_text", "") or ""):
        found.append("phone")
    # Extras supplied by the user in the UI.
    for raw in extra_terms or []:
        term = raw.strip()
        if term and term.lower() in blob:
            found.append(term)
    # Case-insensitive dedupe, keeping first occurrence, capped at 24 terms.
    seen = set()
    unique = []
    for term in found:
        key = term.lower()
        if key not in seen:
            unique.append(term)
            seen.add(key)
    return unique[:24]
|
| 407 |
+
|
| 408 |
+
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None,
|
| 409 |
+
cluster_label: Optional[str] = None,
|
| 410 |
+
do_highlight: bool = True,
|
| 411 |
+
extra_terms: Optional[List[str]] = None) -> str:
|
| 412 |
subject = (row.get("subject") or "").strip()
|
| 413 |
body = (row.get("body_text") or "").strip()
|
| 414 |
from_email = row.get("from_email") or ""
|
| 415 |
date = row.get("date") or ""
|
| 416 |
tags = row.get("tags") or []
|
| 417 |
+
flags = row.get("flags") or []
|
| 418 |
sentiment = row.get("sentiment") or "(unknown)"
|
| 419 |
|
| 420 |
+
hl_terms = []
|
| 421 |
+
if do_highlight:
|
| 422 |
+
hl_terms = (query_terms or []) + _compile_highlight_terms(row, extra_terms or [])
|
| 423 |
+
# make unique, case-insensitive
|
| 424 |
+
seen=set(); uniq=[]
|
| 425 |
+
for t in hl_terms:
|
| 426 |
+
tl=t.lower()
|
| 427 |
+
if tl and tl not in seen:
|
| 428 |
+
uniq.append(t); seen.add(tl)
|
| 429 |
+
hl_terms = uniq[:24]
|
| 430 |
+
|
| 431 |
def hi(text: str) -> str:
|
| 432 |
+
if not text or not do_highlight or not hl_terms:
|
| 433 |
return text
|
| 434 |
out = text
|
| 435 |
+
for qt in hl_terms:
|
| 436 |
if not qt:
|
| 437 |
continue
|
| 438 |
try:
|
|
|
|
| 449 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 450 |
body_html = body_h.replace("\n", "<br/>")
|
| 451 |
|
| 452 |
+
def pill(s, cls="tag"):
|
| 453 |
+
return f'<span class="{cls}">{s}</span>'
|
| 454 |
+
|
| 455 |
tag_html = ""
|
| 456 |
+
pills = []
|
| 457 |
if isinstance(tags, list) and tags:
|
| 458 |
+
pills += [pill(t, "tag") for t in tags]
|
| 459 |
+
if isinstance(flags, list) and flags:
|
| 460 |
+
pills += [pill(f, "tag") for f in flags]
|
| 461 |
+
if pills:
|
| 462 |
+
tag_html = " ".join(pills)
|
| 463 |
|
| 464 |
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 465 |
|
|
|
|
| 492 |
self.avgdl_ = None
|
| 493 |
|
| 494 |
def fit(self, X):
|
|
|
|
| 495 |
N = X.shape[0]
|
|
|
|
| 496 |
df = np.bincount(X.tocsc().indices, minlength=X.shape[1]).astype(np.float64)
|
| 497 |
self.idf_ = np.log((N - df + 0.5) / (df + 0.5 + 1e-12))
|
| 498 |
dl = np.asarray(X.sum(axis=1)).ravel()
|
|
|
|
| 512 |
data[i] = (self.idf_[cols[i]] * (tf * (k1 + 1))) / (denom + 1e-12)
|
| 513 |
return X
|
| 514 |
|
|
|
|
| 515 |
def enrich_text(row: pd.Series) -> str:
|
| 516 |
subj = row.get("subject","") or ""
|
| 517 |
body = row.get("body_text","") or ""
|
|
|
|
| 521 |
if PHONE_RE.search(t): tokens.append("__HAS_PHONE__")
|
| 522 |
if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
|
| 523 |
if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
|
| 524 |
+
if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
|
| 525 |
return (t + " " + " ".join(tokens)).strip()
|
| 526 |
|
| 527 |
# =================== Cluster labeling: PMI bigrams ===================
|
|
|
|
| 555 |
labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
|
| 556 |
return labels_out
|
| 557 |
|
| 558 |
+
# =================== Auto-k & merge ===================
|
| 559 |
def choose_k_by_kneedle(X, ks=(50,100,150,200,300,400,500)):
|
| 560 |
n = X.shape[0]
|
| 561 |
if n > 40000:
|
|
|
|
| 579 |
return k_best, dict(zip(ks, inertias))
|
| 580 |
|
| 581 |
def auto_k_rule(n_docs: int) -> int:
    """Heuristic cluster count: ~110 * sqrt(docs / 50), clamped to [120, 600]."""
    raw = math.sqrt(max(n_docs, 1) / 50.0) * 110
    return int(max(120, min(600, raw)))
|
| 583 |
|
|
|
|
| 584 |
def merge_close_clusters(labels, centers, thresh=0.92):
|
| 585 |
centers = sk_normalize(centers)
|
| 586 |
sim = cosine_similarity(centers, centers)
|
|
|
|
| 604 |
labels2 = np.array([idmap[root[int(c)]] for c in labels], dtype=int)
|
| 605 |
return labels2
|
| 606 |
|
|
|
|
| 607 |
def seeded_centroids_in_lsa(lexicons: Dict[str, List[str]], count_vec: CountVectorizer,
|
| 608 |
lsa_components: np.ndarray, norm_obj: Normalizer,
|
| 609 |
d_word: int, d_full: int, k: int) -> Optional[np.ndarray]:
|
|
|
|
| 610 |
seeds_word = []
|
| 611 |
vocab = count_vec.vocabulary_
|
| 612 |
for _, words in lexicons.items():
|
|
|
|
| 621 |
seeds_word.append(v)
|
| 622 |
if not seeds_word:
|
| 623 |
return None
|
|
|
|
| 624 |
seeds_full = []
|
| 625 |
for v in seeds_word:
|
| 626 |
vf = np.zeros((d_full,), dtype=np.float32)
|
| 627 |
vf[:d_word] = v
|
| 628 |
seeds_full.append(vf)
|
| 629 |
+
seeds_full = np.stack(seeds_full, axis=0)
|
| 630 |
+
seeds_red = seeds_full @ lsa_components.T
|
|
|
|
| 631 |
seeds_red = norm_obj.transform(seeds_red.astype(np.float32))
|
|
|
|
|
|
|
| 632 |
if seeds_red.shape[0] >= 2 and seeds_red.shape[0] <= k:
|
| 633 |
return seeds_red
|
| 634 |
return None
|
| 635 |
|
| 636 |
+
# =================== Scoring & Flags ===================
|
| 637 |
+
def _hour_of(dt_iso: str) -> Optional[int]:
|
| 638 |
+
try:
|
| 639 |
+
if not dt_iso: return None
|
| 640 |
+
dt = pd.to_datetime(dt_iso, utc=True, errors="coerce")
|
| 641 |
+
if pd.isna(dt): return None
|
| 642 |
+
# treat UTC for lack of per-user tz; still useful as "odd hour"
|
| 643 |
+
return int(dt.hour)
|
| 644 |
+
except Exception:
|
| 645 |
+
return None
|
| 646 |
+
|
| 647 |
+
def _attachment_flags(names: List[str]) -> List[str]:
    """Return up to five display flags for attachment names matching ATTACH_NAME_RE.

    Each flag is a paperclip marker plus the name truncated to 40 characters.
    """
    hits = ["📎 " + n[:40] for n in (names or []) if ATTACH_NAME_RE.search(n)]
    return hits[:5]
|
| 653 |
+
|
| 654 |
+
def corruption_score(row, trusted_domains: set):
|
| 655 |
score = 0.0
|
| 656 |
txt = f'{row.get("subject","")} {row.get("body_text","")}'.lower()
|
| 657 |
for ph in SUSPECT_PHRASES:
|
| 658 |
if ph in txt:
|
| 659 |
score += 2.0
|
| 660 |
break
|
| 661 |
+
if EVASIVE_ACRO_RE.search(txt) or OFFCHANNEL_RE.search(txt):
|
| 662 |
+
score += 1.0
|
| 663 |
if isinstance(row.get("tags"), list) and ("🚩suspect" in row["tags"] or "finance" in row["tags"]):
|
| 664 |
score += 1.5
|
| 665 |
if MONEY_RE.search(txt): score += 0.7
|
|
|
|
| 669 |
body_len = len(row.get("body_text",""))
|
| 670 |
if body_len < 160 and PHONE_RE.search(row.get("body_text","") or ""):
|
| 671 |
score += 0.5
|
| 672 |
+
# personal/off-channel via headers
|
| 673 |
+
fd = (row.get("from_domain") or "").lower()
|
| 674 |
+
if fd in PERSONAL_DOMAINS and fd not in trusted_domains:
|
| 675 |
+
score += 0.5
|
| 676 |
+
# odd hours
|
| 677 |
+
h = _hour_of(row.get("date") or "")
|
| 678 |
+
if h is not None and (h < 6 or h > 22):
|
| 679 |
+
score += 0.3
|
| 680 |
return score
|
| 681 |
|
| 682 |
# =================== Gradio UI ===================
|
|
|
|
| 695 |
mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
|
| 696 |
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 697 |
.small { color:#475569; font-size:12px; }
|
| 698 |
+
.cursor { cursor:pointer; }
|
| 699 |
"""
|
| 700 |
|
| 701 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 702 |
+
gr.Markdown("# Email Investigator — BM25 + Char-grams + (optional) LSA → MiniBatchKMeans")
|
|
|
|
|
|
|
|
|
|
| 703 |
|
| 704 |
with gr.Row():
|
| 705 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
|
|
|
| 719 |
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 720 |
with gr.Row():
|
| 721 |
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available & LSA on)", value=True)
|
| 722 |
+
use_iso = gr.Checkbox(label="Compute anomaly score (IsolationForest on LSA)", value=False)
|
| 723 |
|
| 724 |
+
with gr.Accordion("Investigation Controls", open=True):
|
| 725 |
with gr.Row():
|
| 726 |
+
trusted_domains_in = gr.Textbox(label="Trusted org domains (comma-separated)", value="example.gov, example.org")
|
| 727 |
+
extra_keywords_in = gr.Textbox(label="Extra suspicious phrases (comma-separated)", value="")
|
| 728 |
+
highlight_toggle = gr.Checkbox(label="Highlight suspect patterns in reader", value=True)
|
|
|
|
|
|
|
| 729 |
with gr.Row():
|
| 730 |
+
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 731 |
+
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
| 732 |
+
sender_drop = gr.Dropdown(label="Sender email", choices=[], value=None, allow_custom_value=False)
|
| 733 |
+
lang_drop = gr.Dropdown(label="Language", choices=["(any)"], value="(any)", allow_custom_value=False)
|
| 734 |
+
sentiment_drop = gr.Dropdown(label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)")
|
| 735 |
+
tag_drop = gr.Dropdown(label="Tag", choices=["(any)", "🚩suspect", "finance", "off-channel", "odd-hours", "personal-mail"], value="(any)")
|
| 736 |
with gr.Row():
|
| 737 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 738 |
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 739 |
+
sort_by = gr.Dropdown(label="Sort by", choices=["corruption_score","date","anomaly_score","search_score"], value="corruption_score")
|
| 740 |
+
sort_dir = gr.Dropdown(label="Order", choices=["desc","asc"], value="desc")
|
| 741 |
|
| 742 |
with gr.Row():
|
| 743 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
| 745 |
status = gr.Markdown("")
|
| 746 |
|
| 747 |
with gr.Row():
|
| 748 |
+
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 749 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 750 |
|
| 751 |
+
with gr.Row():
|
| 752 |
+
actors_df = gr.Dataframe(label="Top actors (by degree / unique counterparts)", interactive=False, wrap=True)
|
| 753 |
+
offhours_df = gr.Dataframe(label="Off-hours & personal-mail hits", interactive=False, wrap=True)
|
| 754 |
+
|
| 755 |
gr.Markdown("### Search")
|
| 756 |
with gr.Row():
|
| 757 |
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
|
|
|
| 760 |
email_view = gr.HTML(label="Reader")
|
| 761 |
|
| 762 |
# State
|
| 763 |
+
state_df = gr.State()
|
| 764 |
+
state_vec = gr.State()
|
| 765 |
+
state_X_reduced = gr.State()
|
| 766 |
+
state_index = gr.State()
|
| 767 |
+
state_term_names = gr.State()
|
| 768 |
+
state_query_terms = gr.State()
|
| 769 |
state_use_lsa = gr.State()
|
| 770 |
state_use_faiss = gr.State()
|
| 771 |
state_svd = gr.State()
|
| 772 |
state_norm = gr.State()
|
| 773 |
+
state_dims = gr.State()
|
| 774 |
+
state_extra_terms = gr.State()
|
| 775 |
+
state_highlight = gr.State()
|
| 776 |
|
| 777 |
# -------- IO helpers --------
|
| 778 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
|
|
| 807 |
df: pd.DataFrame,
|
| 808 |
cluster: Optional[str],
|
| 809 |
domain: Optional[str],
|
| 810 |
+
sender: Optional[str],
|
| 811 |
+
lang_value: str,
|
| 812 |
sentiment: str,
|
| 813 |
tag_value: str,
|
| 814 |
start: str,
|
|
|
|
| 822 |
out = out[out["cluster_id"] == cid]
|
| 823 |
if domain and domain != "(any)":
|
| 824 |
out = out[out["from_domain"] == domain]
|
| 825 |
+
if sender and sender != "(any)":
|
| 826 |
+
out = out[out["from_email"] == sender]
|
| 827 |
+
if lang_value and lang_value != "(any)":
|
| 828 |
+
out = out[out["lang"] == lang_value]
|
| 829 |
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 830 |
out = out[out["sentiment"].astype(str) == sentiment]
|
| 831 |
if tag_value and tag_value != "(any)":
|
| 832 |
+
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))
|
| 833 |
+
| out["flags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
|
| 834 |
if start:
|
| 835 |
try:
|
| 836 |
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
|
|
|
| 845 |
pass
|
| 846 |
return out
|
| 847 |
|
| 848 |
+
# -------- Simple social network stats --------
|
| 849 |
+
def social_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Top-50 addresses by degree (number of unique counterparts).

    An undirected edge is added between the sender and every to/cc recipient;
    self-loops and missing addresses are skipped.

    Args:
        df: Frame with ``from_email`` (str) and ``to_emails`` (list) columns.

    Returns:
        Frame with columns ``address`` and ``degree``, sorted by degree
        descending, at most 50 rows. Empty (but correctly-columned) when no
        edges exist — the original crashed with KeyError on sort_values here.
    """
    deg = {}  # address -> set of unique counterparts

    def add_edge(a, b):
        if not a or not b or a == b:
            return
        deg.setdefault(a, set()).add(b)
        deg.setdefault(b, set()).add(a)

    for _, r in df.iterrows():
        sender = r.get("from_email") or ""
        for rcpt in (r.get("to_emails") or []):
            add_edge(sender, rcpt)

    rows = [{"address": addr, "degree": len(nbrs)} for addr, nbrs in deg.items()]
    if not rows:
        # Guard: an empty DataFrame has no "degree" column to sort on.
        return pd.DataFrame(columns=["address", "degree"])
    return pd.DataFrame(rows).sort_values("degree", ascending=False).head(50)
|
| 866 |
+
|
| 867 |
# -------- Main pipeline --------
|
| 868 |
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 869 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 870 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle):
|
| 871 |
if inbox_file is None:
|
| 872 |
return ("**Please upload a file.**",
|
| 873 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 874 |
+
None, None, None, None)
|
| 875 |
|
| 876 |
use_lang = not bool(skip_lang)
|
| 877 |
|
| 878 |
+
# trusted org domains
|
| 879 |
+
trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
|
| 880 |
+
extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
|
| 881 |
+
# extend SUSPECT_PHRASES runtime (no mutation of constant list)
|
| 882 |
+
extra_terms_lower = [t.lower() for t in extra_terms]
|
| 883 |
+
|
| 884 |
recs = _load_json_records(inbox_file.name)
|
| 885 |
if not recs:
|
| 886 |
return ("**No valid records found.**",
|
| 887 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 888 |
+
None, None, None, None)
|
| 889 |
|
| 890 |
# Normalize
|
| 891 |
normd = []
|
|
|
|
| 896 |
df = pd.DataFrame(normd)
|
| 897 |
if df.empty:
|
| 898 |
return ("**No usable email records after normalization.**",
|
| 899 |
+
None, None, None, None, None, None, None, None, None, None, None, None, None, None, None,
|
| 900 |
+
None, None, None, None)
|
| 901 |
|
| 902 |
# Deduplicate conservatively
|
| 903 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 904 |
|
| 905 |
+
# Tags + Sentiment
|
| 906 |
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 907 |
df = compute_sentiment_column(df)
|
| 908 |
|
| 909 |
+
# Row-level flags (off-hours, personal-mail, attachments)
|
| 910 |
+
flags = []
|
| 911 |
+
for _, row in df.iterrows():
|
| 912 |
+
f = []
|
| 913 |
+
h = _hour_of(row.get("date") or "")
|
| 914 |
+
if h is not None and (h < 6 or h > 22):
|
| 915 |
+
f.append("odd-hours")
|
| 916 |
+
fd = (row.get("from_domain") or "").lower()
|
| 917 |
+
if (fd in PERSONAL_DOMAINS) and (fd not in trusted):
|
| 918 |
+
f.append("personal-mail")
|
| 919 |
+
# attachment names of interest
|
| 920 |
+
f += _attachment_flags(row.get("attachments") or [])
|
| 921 |
+
flags.append(f)
|
| 922 |
+
df["flags"] = flags
|
| 923 |
+
|
| 924 |
# Enriched texts (adds __HAS_*__ flags)
|
| 925 |
texts = list(df.apply(enrich_text, axis=1))
|
| 926 |
|
| 927 |
+
# === Vectorization ===
|
| 928 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 929 |
count_vec = CountVectorizer(
|
| 930 |
analyzer="word",
|
|
|
|
| 951 |
d_char = X_char.shape[1]
|
| 952 |
d_full = X_full.shape[1]
|
| 953 |
|
| 954 |
+
# LSA
|
| 955 |
use_lsa = bool(use_lsa)
|
| 956 |
X_reduced = None
|
| 957 |
svd_obj = None
|
| 958 |
norm_obj = None
|
| 959 |
if use_lsa:
|
| 960 |
svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 961 |
+
X_reduced_tmp = svd_obj.fit_transform(X_full) # dense
|
| 962 |
norm_obj = Normalizer(copy=False)
|
| 963 |
X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 964 |
del X_reduced_tmp
|
| 965 |
gc.collect()
|
| 966 |
|
| 967 |
+
# Optional anomaly detection (on LSA space)
|
| 968 |
+
anomaly_scores = np.full((len(df),), np.nan, dtype=np.float32)
|
| 969 |
+
if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
|
| 970 |
+
try:
|
| 971 |
+
iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
|
| 972 |
+
iso.fit(X_reduced)
|
| 973 |
+
# higher is less anomalous; convert to anomaly score = -score
|
| 974 |
+
anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
|
| 975 |
+
except Exception:
|
| 976 |
+
pass
|
| 977 |
+
df["anomaly_score"] = anomaly_scores
|
| 978 |
+
|
| 979 |
# K selection
|
| 980 |
if bool(auto_k):
|
| 981 |
if use_lsa:
|
| 982 |
k, _ = choose_k_by_kneedle(X_reduced, ks=(50,100,150,200,300,400,500))
|
| 983 |
else:
|
|
|
|
| 984 |
k = auto_k_rule(X_full.shape[0])
|
| 985 |
else:
|
| 986 |
k = max(10, int(k_clusters or 350))
|
| 987 |
|
| 988 |
+
# Optional seeded init (only in LSA space)
|
| 989 |
init = None
|
| 990 |
if use_lsa:
|
| 991 |
seeds = seeded_centroids_in_lsa(
|
| 992 |
CORR_LEX, count_vec, svd_obj.components_, norm_obj,
|
| 993 |
d_word=d_word, d_full=d_full, k=k
|
| 994 |
)
|
| 995 |
+
if seeds is not None and seeds.shape[0] == k:
|
| 996 |
+
init = seeds
|
|
|
|
|
|
|
|
|
|
|
|
|
| 997 |
|
| 998 |
# KMeans clustering (use LSA space if enabled)
|
| 999 |
X_space = (X_reduced if use_lsa else X_full)
|
|
|
|
| 1006 |
)
|
| 1007 |
labels = kmeans.fit_predict(X_space)
|
| 1008 |
|
| 1009 |
+
# Merge very-similar clusters (LSA only)
|
| 1010 |
if use_lsa:
|
| 1011 |
labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
|
| 1012 |
|
| 1013 |
df["cluster_id"] = labels
|
| 1014 |
|
| 1015 |
+
# Cluster names
|
| 1016 |
term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
|
| 1017 |
df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
|
| 1018 |
|
| 1019 |
+
# CorruptionScore (now uses trusted domains)
|
| 1020 |
+
df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
|
| 1021 |
|
| 1022 |
# Build search index
|
| 1023 |
use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
|
|
|
|
| 1051 |
)
|
| 1052 |
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 1053 |
|
| 1054 |
+
sender_counts = (
|
| 1055 |
+
df.groupby("from_email").size()
|
| 1056 |
+
.reset_index(name="count")
|
| 1057 |
+
.sort_values("count", ascending=False)
|
| 1058 |
+
.head(200)
|
| 1059 |
+
)
|
| 1060 |
+
sender_choices = ["(any)"] + sender_counts["from_email"].tolist()
|
| 1061 |
+
|
| 1062 |
+
# Languages present
|
| 1063 |
+
langs = [l for l in sorted(df["lang"].dropna().unique()) if l and l!="unknown"]
|
| 1064 |
+
lang_choices = ["(any)"] + langs
|
| 1065 |
+
|
| 1066 |
+
# Social stats
|
| 1067 |
+
actors = social_stats(df)
|
| 1068 |
+
|
| 1069 |
+
# Off-hours & personal mail table
|
| 1070 |
+
offp = df[(df["flags"].apply(lambda xs: "odd-hours" in (xs or []))) | (df["flags"].apply(lambda xs: "personal-mail" in (xs or [])))]
|
| 1071 |
+
offhours_table = offp[["date","from_email","from_domain","subject","flags","corruption_score"]].sort_values("corruption_score", ascending=False).head(200)
|
| 1072 |
+
|
| 1073 |
+
# Default results (sorted)
|
| 1074 |
show_df = df.copy()
|
| 1075 |
if "date" in show_df.columns and show_df["date"].notna().any():
|
| 1076 |
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 1077 |
else:
|
| 1078 |
show_df["_dt"] = pd.NaT
|
| 1079 |
show_df = show_df.sort_values(["corruption_score","_dt"], ascending=[False, False]).drop(columns=["_dt"])
|
| 1080 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
|
|
|
| 1081 |
out_table = show_df[cols_out].head(500)
|
| 1082 |
|
| 1083 |
vec_state = {"count_vec": count_vec, "char_vec": char_vec, "bm25": bm25}
|
|
|
|
| 1086 |
f"**Processed {len(df):,} emails** \n"
|
| 1087 |
f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
|
| 1088 |
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 1089 |
+
f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
|
| 1090 |
+
f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
|
| 1091 |
)
|
| 1092 |
|
| 1093 |
gc.collect()
|
| 1094 |
|
| 1095 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 1096 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1097 |
+
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1098 |
+
lang_update = gr.update(choices=lang_choices, value="(any)")
|
| 1099 |
|
| 1100 |
return (
|
| 1101 |
status_md,
|
| 1102 |
cluster_counts, domain_counts,
|
| 1103 |
+
actors, offhours_table,
|
| 1104 |
out_table,
|
| 1105 |
df, vec_state, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 1106 |
use_lsa, bool(use_faiss),
|
| 1107 |
+
cluster_update, domain_update, sender_update, lang_update,
|
| 1108 |
svd_obj, norm_obj,
|
| 1109 |
+
(d_word, d_char),
|
| 1110 |
+
extra_terms_lower, bool(highlight_toggle)
|
| 1111 |
)
|
| 1112 |
|
| 1113 |
(run_btn.click)(
|
| 1114 |
process_file,
|
| 1115 |
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 1116 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss, use_iso,
|
| 1117 |
+
trusted_domains_in, extra_keywords_in, highlight_toggle],
|
| 1118 |
outputs=[status,
|
| 1119 |
cluster_counts_df, domain_counts_df,
|
| 1120 |
+
actors_df, offhours_df,
|
| 1121 |
results_df,
|
| 1122 |
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 1123 |
state_use_lsa, state_use_faiss,
|
| 1124 |
+
cluster_drop, domain_drop, sender_drop, lang_drop,
|
| 1125 |
state_svd, state_norm,
|
| 1126 |
+
state_dims,
|
| 1127 |
+
state_extra_terms, state_highlight]
|
| 1128 |
)
|
| 1129 |
|
| 1130 |
# -------- Filtering & Search --------
|
| 1131 |
+
def _sort_results(df, by, direction):
|
| 1132 |
+
if df is None or len(df)==0:
|
| 1133 |
return pd.DataFrame()
|
| 1134 |
+
tmp = df.copy()
|
| 1135 |
+
if "date" in tmp.columns:
|
|
|
|
|
|
|
| 1136 |
tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
|
| 1137 |
+
else:
|
| 1138 |
+
tmp["_dt"] = pd.NaT
|
| 1139 |
+
by = by or "corruption_score"
|
| 1140 |
+
asc = (direction == "asc")
|
| 1141 |
+
if by == "date":
|
| 1142 |
+
tmp = tmp.sort_values(["_dt"], ascending=asc)
|
| 1143 |
+
elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
|
| 1144 |
+
tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
|
| 1145 |
+
else:
|
| 1146 |
+
# corruption_score or search_score (if present)
|
| 1147 |
+
col = by if by in tmp.columns else "corruption_score"
|
| 1148 |
+
tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
|
| 1149 |
+
tmp = tmp.drop(columns=["_dt"])
|
| 1150 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score"]
|
| 1151 |
+
acc = [c for c in cols_out if c in tmp.columns]
|
| 1152 |
+
return tmp[acc].head(500)
|
| 1153 |
+
|
| 1154 |
+
def refresh_results(df, cluster_choice, domain_choice, sender_choice, lang_choice, sentiment_choice, tag_choice, start, end, sort_by, sort_dir):
    """Apply every active UI filter to the processed frame, then sort for display."""
    if df is None or len(df) == 0:
        return pd.DataFrame()
    filtered = _apply_filters(
        df, cluster_choice, domain_choice, sender_choice,
        lang_choice, sentiment_choice, tag_choice, start, end,
    )
    return _sort_results(filtered, sort_by, sort_dir)
|
| 1159 |
|
| 1160 |
+
for ctrl in [cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]:
|
| 1161 |
ctrl.change(
|
| 1162 |
refresh_results,
|
| 1163 |
+
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1164 |
outputs=[results_df]
|
| 1165 |
)
|
| 1166 |
|
| 1167 |
reset_btn.click(
|
| 1168 |
+
lambda: ["(any)", "(any)", "(any)", "(any)", "(any)", "(any)", "", "", "corruption_score", "desc"],
|
| 1169 |
inputs=[],
|
| 1170 |
+
outputs=[cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir]
|
| 1171 |
).then(
|
| 1172 |
refresh_results,
|
| 1173 |
+
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1174 |
outputs=[results_df]
|
| 1175 |
)
|
| 1176 |
|
|
|
|
| 1203 |
q_full = hstack([q_word, q_char], format="csr")
|
| 1204 |
return q_full
|
| 1205 |
|
| 1206 |
+
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1207 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1208 |
return pd.DataFrame(), []
|
| 1209 |
q_terms = _tokenize_query(q)
|
|
|
|
| 1220 |
inds = indices[0]
|
| 1221 |
sims = 1.0 - distances[0]
|
| 1222 |
results = df.iloc[inds].copy()
|
| 1223 |
+
results["search_score"] = sims
|
| 1224 |
elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
|
| 1225 |
D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
|
| 1226 |
inds = I[0]
|
| 1227 |
sims = D[0]
|
| 1228 |
results = df.iloc[inds].copy()
|
| 1229 |
+
results["search_score"] = sims
|
| 1230 |
else:
|
| 1231 |
return pd.DataFrame(), q_terms
|
| 1232 |
|
| 1233 |
+
# blend with corruption score lightly
|
|
|
|
| 1234 |
cs = results["corruption_score"].fillna(0.0)
|
| 1235 |
cs = (cs - cs.min()) / (cs.max() - cs.min() + 1e-9)
|
| 1236 |
+
results["_blend"] = 0.7*results["search_score"].values + 0.3*cs.values
|
| 1237 |
+
# sort UI-selected way
|
| 1238 |
+
if sort_by == "search_score":
|
| 1239 |
+
results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
|
| 1240 |
+
else:
|
| 1241 |
+
# use blended but keep sort_by if chosen
|
| 1242 |
+
if sort_by in results.columns:
|
| 1243 |
+
results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
|
| 1244 |
+
else:
|
| 1245 |
+
results = results.sort_values("_blend", ascending=(sort_dir=="asc"))
|
| 1246 |
+
results = results.drop(columns=["_blend"])
|
| 1247 |
+
cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "lang", "tags", "flags", "sentiment", "corruption_score", "anomaly_score", "search_score"]
|
| 1248 |
return results[cols].head(50), q_terms
|
| 1249 |
|
| 1250 |
search_btn.click(
|
| 1251 |
search_fn,
|
| 1252 |
+
inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss, state_svd, state_norm, sort_by, sort_dir],
|
| 1253 |
outputs=[results_df, state_query_terms]
|
| 1254 |
)
|
| 1255 |
|
| 1256 |
+
def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str],
|
| 1257 |
+
query_terms: Optional[List[str]], extra_terms: List[str], do_highlight: bool):
|
| 1258 |
try:
|
| 1259 |
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
| 1260 |
except Exception:
|
|
|
|
| 1279 |
row = cand.iloc[0]
|
| 1280 |
cid = int(row.get("cluster_id", -1))
|
| 1281 |
clabel = term_names.get(cid, f"cluster_{cid}") if term_names else None
|
| 1282 |
+
return build_highlighted_html(row, query_terms=query_terms, cluster_label=clabel, do_highlight=bool(do_highlight), extra_terms=extra_terms)
|
| 1283 |
|
| 1284 |
results_df.select(
|
| 1285 |
on_row_select,
|
| 1286 |
+
inputs=[results_df, state_df, state_term_names, state_query_terms, state_extra_terms, state_highlight],
|
| 1287 |
outputs=[email_view]
|
| 1288 |
)
|
| 1289 |
|
| 1290 |
+
# Click cluster summary to filter
|
| 1291 |
+
def on_cluster_click(evt: gr.SelectData, df_sum: pd.DataFrame):
    """When a cluster-summary row is clicked, return its label for the dropdown.

    Falls back to "(any)" whenever the event index or summary frame is unusable.
    """
    try:
        idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except Exception:
        idx = evt.index if hasattr(evt, "index") else None
    if idx is None or df_sum is None or len(df_sum) == 0:
        return "(any)"
    label = df_sum.iloc[idx]["label"]
    return label if isinstance(label, str) else "(any)"
|
| 1300 |
+
|
| 1301 |
+
cluster_counts_df.select(
|
| 1302 |
+
on_cluster_click,
|
| 1303 |
+
inputs=[cluster_counts_df],
|
| 1304 |
+
outputs=[cluster_drop]
|
| 1305 |
+
).then(
|
| 1306 |
+
refresh_results,
|
| 1307 |
+
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1308 |
+
outputs=[results_df]
|
| 1309 |
+
)
|
| 1310 |
+
|
| 1311 |
if __name__ == "__main__":
|
| 1312 |
demo.launch()
|