Update app.py
Browse files
app.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
-
import os, re, json, io,
|
| 2 |
from datetime import datetime
|
| 3 |
-
from typing import List, Dict, Any, Tuple
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
|
|
|
| 7 |
import pyarrow as pa
|
| 8 |
import pyarrow.parquet as pq
|
| 9 |
|
|
@@ -15,39 +16,61 @@ DetectorFactory.seed = 0
|
|
| 15 |
import gradio as gr
|
| 16 |
from tqdm import tqdm
|
| 17 |
|
| 18 |
-
#
|
| 19 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 20 |
from sklearn.cluster import MiniBatchKMeans
|
| 21 |
from sklearn.neighbors import NearestNeighbors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# =================== Regex & Flags ===================
|
| 24 |
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 25 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 26 |
|
| 27 |
-
# URLs -> "URL" (
|
| 28 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 29 |
|
| 30 |
-
# Phone numbers -> "[PHONE]" (avoid clustering by unique numbers)
|
| 31 |
-
PHONE_RE = re.compile(r'\+?\d[\d\s\-\(\)\.]{7,}\d')
|
| 32 |
-
|
| 33 |
# Quote lines ("> ...")
|
| 34 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
| 35 |
|
| 36 |
# Signature separator: lines after "-- " (standard)
|
| 37 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
| 38 |
|
| 39 |
-
#
|
| 40 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 41 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
| 42 |
|
| 43 |
-
#
|
| 44 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 45 |
-
FWD_MSG_RE
|
| 46 |
-
ON_WROTE_RE
|
| 47 |
|
| 48 |
# Toggle for language detection (skip for speed)
|
| 49 |
SKIP_LANGDETECT = True
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# =================== HTML/Text Cleanup ===================
|
| 52 |
def html_to_text(html: str) -> str:
|
| 53 |
if not html:
|
|
@@ -58,22 +81,18 @@ def html_to_text(html: str) -> str:
|
|
| 58 |
return soup.get_text(separator="\n")
|
| 59 |
|
| 60 |
def strip_quotes_and_sigs(text: str) -> str:
|
| 61 |
-
"""Drop quoted lines, signatures, device footers,
|
| 62 |
if not text:
|
| 63 |
return ""
|
| 64 |
-
# Remove quoted lines that start with ">"
|
| 65 |
text = QUOTE_LINE_RE.sub("", text)
|
| 66 |
|
| 67 |
-
# Cut at signature separator if present
|
| 68 |
parts = SIG_RE.split(text)
|
| 69 |
if parts:
|
| 70 |
text = parts[0]
|
| 71 |
|
| 72 |
-
# Remove device footers (EN + Hebrew)
|
| 73 |
text = SENT_FROM_RE.sub("", text)
|
| 74 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
| 75 |
|
| 76 |
-
# Truncate at forwarded/quoted markers
|
| 77 |
cut = None
|
| 78 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
| 79 |
m = pat.search(text)
|
|
@@ -98,7 +117,6 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 98 |
"""
|
| 99 |
Extract inline headers (From, To, CC, Date, Subject) from the text blob.
|
| 100 |
Returns (headers_dict, remaining_body_text).
|
| 101 |
-
Handles cases without blank line between headers and body.
|
| 102 |
"""
|
| 103 |
headers: Dict[str, str] = {}
|
| 104 |
lines = (text or "").splitlines()
|
|
@@ -118,7 +136,6 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 118 |
key = key.strip()
|
| 119 |
value = rest.strip()
|
| 120 |
if value == "":
|
| 121 |
-
# Multi-line value continuation
|
| 122 |
j = i + 1
|
| 123 |
cont = []
|
| 124 |
while j < len(lines):
|
|
@@ -126,7 +143,6 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 126 |
nxts = nxt.strip()
|
| 127 |
if nxts == "" or header_pat.match(nxt):
|
| 128 |
break
|
| 129 |
-
# For Subject, avoid swallowing body if no blank line
|
| 130 |
if key.lower() == "subject":
|
| 131 |
if FWD_BEGIN_RE.match(nxts) or FWD_MSG_RE.match(nxts) or ON_WROTE_RE.match(nxts):
|
| 132 |
break
|
|
@@ -149,25 +165,19 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
|
|
| 149 |
body_text = "\n".join(lines[i:]) if i < len(lines) else ""
|
| 150 |
return headers, body_text
|
| 151 |
|
| 152 |
-
# =================== Normalization ===================
|
| 153 |
-
def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
| 154 |
-
"""
|
| 155 |
-
Tailored for records where headers (From/To/Date/Subject) live inside 'text',
|
| 156 |
-
possibly with 'html' holding the rendered version. Skips meta-only rows.
|
| 157 |
-
"""
|
| 158 |
-
# Skip metadata-only records early
|
| 159 |
if str(raw.get("type", "")).lower() == "meta":
|
| 160 |
return {}
|
| 161 |
|
| 162 |
-
# Get raw text or HTML
|
| 163 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 164 |
-
html_content
|
| 165 |
if html_content and not body_text_raw:
|
| 166 |
body_text_raw = html_to_text(html_content)
|
| 167 |
|
| 168 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 169 |
|
| 170 |
-
# Extract inline headers
|
| 171 |
subject_text = ""
|
| 172 |
from_name = from_email = from_domain = ""
|
| 173 |
date_val = raw.get("date") or raw.get("Date") or ""
|
|
@@ -178,17 +188,15 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 178 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 179 |
date_val = headers.get("Date", "") or date_val
|
| 180 |
|
| 181 |
-
# Clean body:
|
| 182 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 183 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
| 184 |
-
body_clean = PHONE_RE.sub(" [PHONE] ", body_clean)
|
| 185 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
| 186 |
body_text = body_clean
|
| 187 |
|
| 188 |
from_name, from_email = parse_name_email(sender)
|
| 189 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 190 |
else:
|
| 191 |
-
# Fallback if no text found (rare for your corpus)
|
| 192 |
subject_text = ftfy.fix_text(raw.get("subject") or raw.get("Subject") or "").strip()
|
| 193 |
body_text = ftfy.fix_text(raw.get("body_text") or raw.get("text") or "")
|
| 194 |
body_text = URL_RE.sub(" URL ", body_text)
|
|
@@ -198,11 +206,9 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 198 |
from_name, from_email = parse_name_email(sender)
|
| 199 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 200 |
|
| 201 |
-
# Subject normalization
|
| 202 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 203 |
|
| 204 |
-
|
| 205 |
-
if not SKIP_LANGDETECT:
|
| 206 |
try:
|
| 207 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
| 208 |
except Exception:
|
|
@@ -210,7 +216,6 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 210 |
else:
|
| 211 |
lang = "unknown"
|
| 212 |
|
| 213 |
-
# Date -> ISO8601 if possible
|
| 214 |
iso_date = ""
|
| 215 |
if isinstance(date_val, (int, float)):
|
| 216 |
try:
|
|
@@ -220,7 +225,6 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 220 |
elif isinstance(date_val, str) and date_val:
|
| 221 |
iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
|
| 222 |
|
| 223 |
-
# Stable IDs
|
| 224 |
msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
|
| 225 |
if not msg_id:
|
| 226 |
msg_id = f"gen-{uuid.uuid4().hex}"
|
|
@@ -242,45 +246,199 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 242 |
"text_hash": text_hash,
|
| 243 |
}
|
| 244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
# =================== Gradio UI ===================
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
gr.Markdown("""
|
| 248 |
-
# Email
|
| 249 |
-
**
|
| 250 |
-
Tailored for emails whose **From/To/Date/Subject** live inside the `text` body.
|
| 251 |
-
Upload **.jsonl** or **.json** (no truncation).
|
| 252 |
""")
|
| 253 |
|
| 254 |
with gr.Row():
|
| 255 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
| 256 |
|
| 257 |
-
with gr.
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
with gr.Row():
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
run_btn = gr.Button("Process", variant="primary")
|
| 270 |
-
status = gr.Textbox(label="Status", interactive=False)
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
|
|
|
| 274 |
|
|
|
|
| 275 |
with gr.Row():
|
| 276 |
-
search_query = gr.Textbox(label="Search
|
| 277 |
search_btn = gr.Button("Search")
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
# -------- IO helpers --------
|
| 286 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
@@ -295,7 +453,6 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 295 |
obj = json.loads(line)
|
| 296 |
except Exception:
|
| 297 |
continue
|
| 298 |
-
# Skip metadata-only entries
|
| 299 |
if str(obj.get("type", "")).lower() == "meta":
|
| 300 |
continue
|
| 301 |
recs.append(obj)
|
|
@@ -312,57 +469,74 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 312 |
recs = [obj]
|
| 313 |
return recs
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
out
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
return out
|
| 334 |
|
| 335 |
-
def auto_k_rule(n_docs: int) -> int:
|
| 336 |
-
# Scales sublinearly; keeps clusters between ~120 and 600
|
| 337 |
-
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 338 |
-
|
| 339 |
# -------- Main pipeline --------
|
| 340 |
-
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
|
|
|
| 341 |
if inbox_file is None:
|
| 342 |
-
return "Please upload a file",
|
|
|
|
| 343 |
|
| 344 |
-
|
| 345 |
-
global SKIP_LANGDETECT
|
| 346 |
-
SKIP_LANGDETECT = bool(skip_lang)
|
| 347 |
|
| 348 |
recs = _load_json_records(inbox_file.name)
|
| 349 |
if not recs:
|
| 350 |
-
return "No valid records found.",
|
|
|
|
| 351 |
|
| 352 |
-
# Normalize
|
| 353 |
normd = []
|
| 354 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 355 |
-
out = normalize_email_record(r)
|
| 356 |
if out and out.get("body_text") is not None:
|
| 357 |
normd.append(out)
|
| 358 |
df = pd.DataFrame(normd)
|
| 359 |
if df.empty:
|
| 360 |
-
return "No usable email records after normalization.",
|
|
|
|
| 361 |
|
| 362 |
# Deduplicate conservatively
|
| 363 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 364 |
|
| 365 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
|
| 367 |
|
| 368 |
# TF-IDF (sparse CSR float32)
|
|
@@ -380,27 +554,49 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 380 |
)
|
| 381 |
X = vec.fit_transform(texts) # CSR float32
|
| 382 |
|
| 383 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
if bool(auto_k):
|
| 385 |
k = auto_k_rule(X.shape[0])
|
| 386 |
else:
|
| 387 |
k = max(10, int(k_clusters or 350))
|
|
|
|
| 388 |
kmeans = MiniBatchKMeans(
|
| 389 |
n_clusters=k,
|
| 390 |
batch_size=int(mb_batch or 4096),
|
| 391 |
random_state=0,
|
| 392 |
-
n_init="auto"
|
| 393 |
)
|
| 394 |
-
labels = kmeans.fit_predict(X)
|
| 395 |
df["cluster_id"] = labels
|
| 396 |
|
| 397 |
-
# Name clusters by top terms
|
| 398 |
term_names = top_terms_per_cluster(X, labels, vec, topn=6)
|
| 399 |
df["cluster_name"] = [term_names[int(c)] for c in labels]
|
| 400 |
|
| 401 |
-
#
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
# Summaries
|
| 406 |
cluster_counts = (
|
|
@@ -409,48 +605,188 @@ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as
|
|
| 409 |
.sort_values("count", ascending=False)
|
| 410 |
.head(500)
|
| 411 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
domain_counts = (
|
| 414 |
df.groupby("from_domain").size()
|
| 415 |
.reset_index(name="count")
|
| 416 |
.sort_values("count", ascending=False)
|
| 417 |
-
.head(
|
| 418 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
preview_html = "<h3>Sample (first 20)</h3>" + df.head(20)[sample_cols].to_html(escape=False, index=False)
|
| 423 |
-
preview_html += "<br/><h3>Top sender domains</h3>" + domain_counts.to_html(escape=False, index=False)
|
| 424 |
|
| 425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
|
| 427 |
-
# Free
|
| 428 |
gc.collect()
|
| 429 |
|
| 430 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
-
|
|
|
|
| 433 |
process_file,
|
| 434 |
-
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
)
|
| 437 |
|
| 438 |
-
# --------
|
| 439 |
-
def
|
| 440 |
-
if
|
| 441 |
return pd.DataFrame()
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
search_btn.click(
|
| 451 |
search_fn,
|
| 452 |
-
inputs=[search_query, state_df,
|
| 453 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
)
|
| 455 |
|
| 456 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
import os, re, json, io, math, gc, uuid
|
| 2 |
from datetime import datetime
|
| 3 |
+
from typing import List, Dict, Any, Tuple, Optional
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
+
|
| 8 |
import pyarrow as pa
|
| 9 |
import pyarrow.parquet as pq
|
| 10 |
|
|
|
|
| 16 |
import gradio as gr
|
| 17 |
from tqdm import tqdm
|
| 18 |
|
| 19 |
+
# sklearn (CPU-friendly)
|
| 20 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 21 |
from sklearn.cluster import MiniBatchKMeans
|
| 22 |
from sklearn.neighbors import NearestNeighbors
|
| 23 |
+
from sklearn.decomposition import TruncatedSVD
|
| 24 |
+
from sklearn.preprocessing import Normalizer
|
| 25 |
+
|
| 26 |
+
# Optional fast ANN (CPU)
|
| 27 |
+
try:
|
| 28 |
+
import faiss # faiss-cpu on HF Space
|
| 29 |
+
FAISS_OK = True
|
| 30 |
+
except Exception:
|
| 31 |
+
FAISS_OK = False
|
| 32 |
+
|
| 33 |
+
# Optional, but strongly recommended (tiny + fast)
|
| 34 |
+
try:
|
| 35 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 36 |
+
VADER_OK = True
|
| 37 |
+
except Exception:
|
| 38 |
+
VADER_OK = False
|
| 39 |
|
| 40 |
# =================== Regex & Flags ===================
|
| 41 |
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 42 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 43 |
|
| 44 |
+
# URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
|
| 45 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
| 47 |
# Quote lines ("> ...")
|
| 48 |
QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
|
| 49 |
|
| 50 |
# Signature separator: lines after "-- " (standard)
|
| 51 |
SIG_RE = re.compile(r"\n-- ?\n", re.M)
|
| 52 |
|
| 53 |
+
# Device footers
|
| 54 |
SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
|
| 55 |
HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
| 56 |
|
| 57 |
+
# Forward/quoted markers
|
| 58 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 59 |
+
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 60 |
+
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 61 |
|
| 62 |
# Toggle for language detection (skip for speed)
|
| 63 |
SKIP_LANGDETECT = True
|
| 64 |
|
| 65 |
+
# Corruption keyword/phrase list (you can extend freely)
|
| 66 |
+
SUSPECT_PHRASES = [
|
| 67 |
+
"off the books", "cover up", "kickback", "bribe", "under the table",
|
| 68 |
+
"no inspection", "special fee", "friendly payment", "confidential deal",
|
| 69 |
+
"nobody will find out", "pay to play", "cash only", "shell company",
|
| 70 |
+
"bid rigging", "embezzle", "slush fund", "false invoice", "ghost employee",
|
| 71 |
+
"contract splitting", "grease payment", "unreported", "unrecorded",
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
# =================== HTML/Text Cleanup ===================
|
| 75 |
def html_to_text(html: str) -> str:
|
| 76 |
if not html:
|
|
|
|
| 81 |
return soup.get_text(separator="\n")
|
| 82 |
|
| 83 |
def strip_quotes_and_sigs(text: str) -> str:
|
| 84 |
+
"""Drop quoted lines, signatures, device footers, forwarded chains."""
|
| 85 |
if not text:
|
| 86 |
return ""
|
|
|
|
| 87 |
text = QUOTE_LINE_RE.sub("", text)
|
| 88 |
|
|
|
|
| 89 |
parts = SIG_RE.split(text)
|
| 90 |
if parts:
|
| 91 |
text = parts[0]
|
| 92 |
|
|
|
|
| 93 |
text = SENT_FROM_RE.sub("", text)
|
| 94 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
| 95 |
|
|
|
|
| 96 |
cut = None
|
| 97 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
| 98 |
m = pat.search(text)
|
|
|
|
| 117 |
"""
|
| 118 |
Extract inline headers (From, To, CC, Date, Subject) from the text blob.
|
| 119 |
Returns (headers_dict, remaining_body_text).
|
|
|
|
| 120 |
"""
|
| 121 |
headers: Dict[str, str] = {}
|
| 122 |
lines = (text or "").splitlines()
|
|
|
|
| 136 |
key = key.strip()
|
| 137 |
value = rest.strip()
|
| 138 |
if value == "":
|
|
|
|
| 139 |
j = i + 1
|
| 140 |
cont = []
|
| 141 |
while j < len(lines):
|
|
|
|
| 143 |
nxts = nxt.strip()
|
| 144 |
if nxts == "" or header_pat.match(nxt):
|
| 145 |
break
|
|
|
|
| 146 |
if key.lower() == "subject":
|
| 147 |
if FWD_BEGIN_RE.match(nxts) or FWD_MSG_RE.match(nxts) or ON_WROTE_RE.match(nxts):
|
| 148 |
break
|
|
|
|
| 165 |
body_text = "\n".join(lines[i:]) if i < len(lines) else ""
|
| 166 |
return headers, body_text
|
| 167 |
|
| 168 |
+
# =================== Normalization & Utilities ===================
|
| 169 |
+
def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
|
| 170 |
+
"""Normalize a single raw record into a structured row."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
if str(raw.get("type", "")).lower() == "meta":
|
| 172 |
return {}
|
| 173 |
|
|
|
|
| 174 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 175 |
+
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 176 |
if html_content and not body_text_raw:
|
| 177 |
body_text_raw = html_to_text(html_content)
|
| 178 |
|
| 179 |
body_text_raw = ftfy.fix_text(body_text_raw or "")
|
| 180 |
|
|
|
|
| 181 |
subject_text = ""
|
| 182 |
from_name = from_email = from_domain = ""
|
| 183 |
date_val = raw.get("date") or raw.get("Date") or ""
|
|
|
|
| 188 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 189 |
date_val = headers.get("Date", "") or date_val
|
| 190 |
|
| 191 |
+
# Clean body: NO phone redaction, per your request
|
| 192 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 193 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
|
|
|
| 194 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
| 195 |
body_text = body_clean
|
| 196 |
|
| 197 |
from_name, from_email = parse_name_email(sender)
|
| 198 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 199 |
else:
|
|
|
|
| 200 |
subject_text = ftfy.fix_text(raw.get("subject") or raw.get("Subject") or "").strip()
|
| 201 |
body_text = ftfy.fix_text(raw.get("body_text") or raw.get("text") or "")
|
| 202 |
body_text = URL_RE.sub(" URL ", body_text)
|
|
|
|
| 206 |
from_name, from_email = parse_name_email(sender)
|
| 207 |
from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
|
| 208 |
|
|
|
|
| 209 |
subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
|
| 210 |
|
| 211 |
+
if use_langdetect:
|
|
|
|
| 212 |
try:
|
| 213 |
lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
|
| 214 |
except Exception:
|
|
|
|
| 216 |
else:
|
| 217 |
lang = "unknown"
|
| 218 |
|
|
|
|
| 219 |
iso_date = ""
|
| 220 |
if isinstance(date_val, (int, float)):
|
| 221 |
try:
|
|
|
|
| 225 |
elif isinstance(date_val, str) and date_val:
|
| 226 |
iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
|
| 227 |
|
|
|
|
| 228 |
msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
|
| 229 |
if not msg_id:
|
| 230 |
msg_id = f"gen-{uuid.uuid4().hex}"
|
|
|
|
| 246 |
"text_hash": text_hash,
|
| 247 |
}
|
| 248 |
|
| 249 |
+
def has_suspect_tag(text: str) -> List[str]:
|
| 250 |
+
"""Return list of corruption/suspicion tags present in text."""
|
| 251 |
+
tags = []
|
| 252 |
+
if not text:
|
| 253 |
+
return tags
|
| 254 |
+
low = text.lower()
|
| 255 |
+
hits = []
|
| 256 |
+
for phrase in SUSPECT_PHRASES:
|
| 257 |
+
if phrase in low:
|
| 258 |
+
hits.append("🚩suspect")
|
| 259 |
+
break
|
| 260 |
+
if "invoice" in low or "payment" in low or "contract" in low:
|
| 261 |
+
hits.append("finance")
|
| 262 |
+
if "wire" in low or "transfer" in low or "cash" in low:
|
| 263 |
+
if "finance" not in hits:
|
| 264 |
+
hits.append("finance")
|
| 265 |
+
return hits
|
| 266 |
+
|
| 267 |
+
def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
| 268 |
+
if not VADER_OK:
|
| 269 |
+
df["sentiment_score"] = np.nan
|
| 270 |
+
df["sentiment"] = "(unknown)"
|
| 271 |
+
return df
|
| 272 |
+
analyzer = SentimentIntensityAnalyzer()
|
| 273 |
+
scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
|
| 274 |
+
df["sentiment_score"] = scores
|
| 275 |
+
# VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
|
| 276 |
+
bins = [-1.01, -0.05, 0.05, 1.01]
|
| 277 |
+
labels = ["negative", "neutral", "positive"]
|
| 278 |
+
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 279 |
+
return df
|
| 280 |
+
|
| 281 |
+
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
|
| 282 |
+
"""Email reader HTML with highlighted query terms and visible tags."""
|
| 283 |
+
subject = (row.get("subject") or "").strip()
|
| 284 |
+
body = (row.get("body_text") or "").strip()
|
| 285 |
+
from_email = row.get("from_email") or ""
|
| 286 |
+
date = row.get("date") or ""
|
| 287 |
+
tags = row.get("tags") or []
|
| 288 |
+
sentiment = row.get("sentiment") or "(unknown)"
|
| 289 |
+
|
| 290 |
+
def hi(text: str) -> str:
|
| 291 |
+
if not text or not query_terms:
|
| 292 |
+
return text
|
| 293 |
+
out = text
|
| 294 |
+
for qt in query_terms:
|
| 295 |
+
if not qt:
|
| 296 |
+
continue
|
| 297 |
+
try:
|
| 298 |
+
pat = re.compile(re.escape(qt), re.I)
|
| 299 |
+
out = pat.sub(lambda m: f"<mark>{m.group(0)}</mark>", out)
|
| 300 |
+
except Exception:
|
| 301 |
+
pass
|
| 302 |
+
return out
|
| 303 |
+
|
| 304 |
+
subject_h = hi(subject)
|
| 305 |
+
body_h = hi(body)
|
| 306 |
+
|
| 307 |
+
# Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
|
| 308 |
+
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 309 |
+
dir_attr = ' dir="rtl"' if rtl else ""
|
| 310 |
+
|
| 311 |
+
tag_html = ""
|
| 312 |
+
if isinstance(tags, list) and tags:
|
| 313 |
+
tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
|
| 314 |
+
|
| 315 |
+
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 316 |
+
|
| 317 |
+
html = f"""
|
| 318 |
+
<div class="email-card">
|
| 319 |
+
<div class="email-header">
|
| 320 |
+
<div>
|
| 321 |
+
<div class="subject">{subject_h or "(no subject)"}</div>
|
| 322 |
+
<div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>
|
| 323 |
+
</div>
|
| 324 |
+
<div class="badges">
|
| 325 |
+
{cluster_html}
|
| 326 |
+
<span class="sentiment">sentiment: <b>{sentiment}</b></span>
|
| 327 |
+
{tag_html}
|
| 328 |
+
</div>
|
| 329 |
+
</div>
|
| 330 |
+
<div class="email-body" {dir_attr}>
|
| 331 |
+
{body_h.replace('\n','<br/>')}
|
| 332 |
+
</div>
|
| 333 |
+
</div>
|
| 334 |
+
"""
|
| 335 |
+
return html
|
| 336 |
+
|
| 337 |
+
def top_terms_per_cluster(X, labels, vectorizer, topn=6):
|
| 338 |
+
names = vectorizer.get_feature_names_out()
|
| 339 |
+
out = {}
|
| 340 |
+
uniq = np.unique(labels)
|
| 341 |
+
for c in uniq:
|
| 342 |
+
mask = (labels == c)
|
| 343 |
+
if mask.sum() == 0:
|
| 344 |
+
out[int(c)] = f"cluster_{c}"
|
| 345 |
+
continue
|
| 346 |
+
# mean TF-IDF per feature inside cluster
|
| 347 |
+
mean_vec = X[mask].mean(axis=0).A1
|
| 348 |
+
if mean_vec.size == 0:
|
| 349 |
+
out[int(c)] = f"cluster_{c}"
|
| 350 |
+
continue
|
| 351 |
+
idx = np.argpartition(mean_vec, -topn)[-topn:]
|
| 352 |
+
idx = idx[np.argsort(-mean_vec[idx])]
|
| 353 |
+
terms = [names[i] for i in idx if mean_vec[i] > 0]
|
| 354 |
+
out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
|
| 355 |
+
return out
|
| 356 |
+
|
| 357 |
+
def auto_k_rule(n_docs: int) -> int:
|
| 358 |
+
# Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
|
| 359 |
+
return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
|
| 360 |
+
|
| 361 |
# =================== Gradio UI ===================
|
| 362 |
+
CSS = """
|
| 363 |
+
:root { --pill:#eef2ff; --pill-text:#3730a3; --tag:#eee; --tag-text:#444;}
|
| 364 |
+
.email-card { background:#fff; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.06); }
|
| 365 |
+
.email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
|
| 366 |
+
.subject { font-size:18px; font-weight:700; margin-bottom:6px; }
|
| 367 |
+
.meta { color:#666; font-size:12px; }
|
| 368 |
+
.badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
|
| 369 |
+
.cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
|
| 370 |
+
.sentiment { font-size:12px; color:#555; }
|
| 371 |
+
.tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
|
| 372 |
+
.email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.5; white-space:normal; }
|
| 373 |
+
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 374 |
+
.small { color:#666; font-size:12px; }
|
| 375 |
+
"""
|
| 376 |
+
|
| 377 |
+
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
| 378 |
gr.Markdown("""
|
| 379 |
+
# Email Investigator — TF-IDF + LSA + MiniBatchKMeans
|
| 380 |
+
**Goal:** quickly surface potentially corruption-related emails via topic clusters, tags, and sentiment.
|
|
|
|
|
|
|
| 381 |
""")
|
| 382 |
|
| 383 |
with gr.Row():
|
| 384 |
inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
|
| 385 |
|
| 386 |
+
with gr.Accordion("Vectorization & Clustering", open=True):
|
| 387 |
+
with gr.Row():
|
| 388 |
+
max_features = gr.Number(label="TF-IDF max_features", value=120_000, precision=0)
|
| 389 |
+
min_df = gr.Number(label="min_df (doc freq ≥)", value=2, precision=0)
|
| 390 |
+
max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
|
| 391 |
+
use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
|
| 392 |
+
skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
|
| 393 |
+
with gr.Row():
|
| 394 |
+
use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
|
| 395 |
+
lsa_dim = gr.Number(label="LSA components", value=150, precision=0)
|
| 396 |
+
auto_k = gr.Checkbox(label="Auto choose k", value=True)
|
| 397 |
+
k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
|
| 398 |
+
mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
|
| 399 |
+
with gr.Row():
|
| 400 |
+
use_faiss = gr.Checkbox(label="Use Faiss ANN for search (if available)", value=True)
|
| 401 |
+
|
| 402 |
+
with gr.Accordion("Filters", open=True):
|
| 403 |
+
with gr.Row():
|
| 404 |
+
cluster_drop = gr.Dropdown(label="Cluster", choices=[], value=None, allow_custom_value=False)
|
| 405 |
+
domain_drop = gr.Dropdown(label="Sender domain", choices=[], value=None, allow_custom_value=False)
|
| 406 |
+
sentiment_drop = gr.Dropdown(
|
| 407 |
+
label="Sentiment", choices=["(any)", "positive", "neutral", "negative"], value="(any)"
|
| 408 |
+
)
|
| 409 |
+
with gr.Row():
|
| 410 |
+
tag_drop = gr.Dropdown(
|
| 411 |
+
label="Tag", choices=["(any)", "🚩suspect", "finance"], value="(any)"
|
| 412 |
+
)
|
| 413 |
+
with gr.Row():
|
| 414 |
+
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 415 |
+
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 416 |
|
| 417 |
with gr.Row():
|
| 418 |
+
run_btn = gr.Button("Process", variant="primary")
|
| 419 |
+
reset_btn = gr.Button("Reset filters")
|
| 420 |
+
status = gr.Markdown("")
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
+
with gr.Row():
|
| 423 |
+
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
|
| 424 |
+
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 425 |
|
| 426 |
+
gr.Markdown("### Search")
|
| 427 |
with gr.Row():
|
| 428 |
+
search_query = gr.Textbox(label="Search (keywords, names, etc.)")
|
| 429 |
search_btn = gr.Button("Search")
|
| 430 |
+
results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True, height=360)
|
| 431 |
+
email_view = gr.HTML(label="Reader")
|
| 432 |
+
|
| 433 |
+
# State
|
| 434 |
+
state_df = gr.State() # full dataframe
|
| 435 |
+
state_vec = gr.State() # TfidfVectorizer
|
| 436 |
+
state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
|
| 437 |
+
state_index = gr.State() # Faiss index or sklearn NN
|
| 438 |
+
state_term_names = gr.State() # dict cluster_id -> label
|
| 439 |
+
state_query_terms = gr.State() # last search terms list
|
| 440 |
+
state_use_lsa = gr.State()
|
| 441 |
+
state_use_faiss = gr.State()
|
| 442 |
|
| 443 |
# -------- IO helpers --------
|
| 444 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
|
|
| 453 |
obj = json.loads(line)
|
| 454 |
except Exception:
|
| 455 |
continue
|
|
|
|
| 456 |
if str(obj.get("type", "")).lower() == "meta":
|
| 457 |
continue
|
| 458 |
recs.append(obj)
|
|
|
|
| 469 |
recs = [obj]
|
| 470 |
return recs
|
| 471 |
|
| 472 |
+
def _apply_filters(df: pd.DataFrame,
|
| 473 |
+
cluster: Optional[str],
|
| 474 |
+
domain: Optional[str],
|
| 475 |
+
sentiment: str,
|
| 476 |
+
tag_value: str,
|
| 477 |
+
start: str, end: str) -> pd.DataFrame:
|
| 478 |
+
out = df
|
| 479 |
+
if cluster and cluster != "(any)":
|
| 480 |
+
# cluster values like "12 — payment, contract (534)"
|
| 481 |
+
m = re.match(r"^(\d+)\s+—", cluster)
|
| 482 |
+
if m:
|
| 483 |
+
cid = int(m.group(1))
|
| 484 |
+
out = out[out["cluster_id"] == cid]
|
| 485 |
+
if domain and domain != "(any)":
|
| 486 |
+
out = out[out["from_domain"] == domain]
|
| 487 |
+
if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
|
| 488 |
+
out = out[out["sentiment"].astype(str) == sentiment]
|
| 489 |
+
if tag_value and tag_value != "(any)":
|
| 490 |
+
# tags is a list; check membership robustly
|
| 491 |
+
out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
|
| 492 |
+
# date bounds
|
| 493 |
+
if start:
|
| 494 |
+
try:
|
| 495 |
+
dt = pd.to_datetime(start, utc=True, errors="coerce")
|
| 496 |
+
out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") >= dt]
|
| 497 |
+
except Exception:
|
| 498 |
+
pass
|
| 499 |
+
if end:
|
| 500 |
+
try:
|
| 501 |
+
dt = pd.to_datetime(end, utc=True, errors="coerce")
|
| 502 |
+
out = out[pd.to_datetime(out["date"], utc=True, errors="coerce") <= dt]
|
| 503 |
+
except Exception:
|
| 504 |
+
pass
|
| 505 |
return out
|
| 506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
# -------- Main pipeline --------
|
| 508 |
+
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 509 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
|
| 510 |
if inbox_file is None:
|
| 511 |
+
return ("**Please upload a file.**",
|
| 512 |
+
None, None, None, None, None, None, None, None, None, None)
|
| 513 |
|
| 514 |
+
use_lang = not bool(skip_lang)
|
|
|
|
|
|
|
| 515 |
|
| 516 |
recs = _load_json_records(inbox_file.name)
|
| 517 |
if not recs:
|
| 518 |
+
return ("**No valid records found.**",
|
| 519 |
+
None, None, None, None, None, None, None, None, None, None)
|
| 520 |
|
| 521 |
+
# Normalize
|
| 522 |
normd = []
|
| 523 |
for r in tqdm(recs, desc="Normalize", leave=False):
|
| 524 |
+
out = normalize_email_record(r, use_langdetect=use_lang)
|
| 525 |
if out and out.get("body_text") is not None:
|
| 526 |
normd.append(out)
|
| 527 |
df = pd.DataFrame(normd)
|
| 528 |
if df.empty:
|
| 529 |
+
return ("**No usable email records after normalization.**",
|
| 530 |
+
None, None, None, None, None, None, None, None, None, None)
|
| 531 |
|
| 532 |
# Deduplicate conservatively
|
| 533 |
df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
|
| 534 |
|
| 535 |
+
# Tags (suspect/finance) + Sentiment
|
| 536 |
+
df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
|
| 537 |
+
df = compute_sentiment_column(df)
|
| 538 |
+
|
| 539 |
+
# Texts for modeling
|
| 540 |
texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
|
| 541 |
|
| 542 |
# TF-IDF (sparse CSR float32)
|
|
|
|
| 554 |
)
|
| 555 |
X = vec.fit_transform(texts) # CSR float32
|
| 556 |
|
| 557 |
+
# LSA (TruncatedSVD + Normalizer) for stability/quality
|
| 558 |
+
use_lsa = bool(use_lsa)
|
| 559 |
+
X_reduced = None
|
| 560 |
+
if use_lsa:
|
| 561 |
+
svd = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
|
| 562 |
+
X_reduced_tmp = svd.fit_transform(X) # dense (n_docs x lsa_dim)
|
| 563 |
+
normalizer = Normalizer(copy=False)
|
| 564 |
+
X_reduced = normalizer.fit_transform(X_reduced_tmp).astype(np.float32)
|
| 565 |
+
del X_reduced_tmp
|
| 566 |
+
gc.collect()
|
| 567 |
+
|
| 568 |
+
# KMeans clustering
|
| 569 |
if bool(auto_k):
|
| 570 |
k = auto_k_rule(X.shape[0])
|
| 571 |
else:
|
| 572 |
k = max(10, int(k_clusters or 350))
|
| 573 |
+
|
| 574 |
kmeans = MiniBatchKMeans(
|
| 575 |
n_clusters=k,
|
| 576 |
batch_size=int(mb_batch or 4096),
|
| 577 |
random_state=0,
|
| 578 |
+
n_init="auto",
|
| 579 |
)
|
| 580 |
+
labels = kmeans.fit_predict(X_reduced if use_lsa else X)
|
| 581 |
df["cluster_id"] = labels
|
| 582 |
|
| 583 |
+
# Name clusters by top terms (use original TF-IDF for interpretability)
|
| 584 |
term_names = top_terms_per_cluster(X, labels, vec, topn=6)
|
| 585 |
df["cluster_name"] = [term_names[int(c)] for c in labels]
|
| 586 |
|
| 587 |
+
# Build search index
|
| 588 |
+
use_faiss = bool(use_faiss) and FAISS_OK
|
| 589 |
+
index_obj = None
|
| 590 |
+
if use_faiss and use_lsa:
|
| 591 |
+
# cosine ≈ inner product on normalized vectors
|
| 592 |
+
d = (X_reduced.shape[1])
|
| 593 |
+
index_obj = faiss.IndexFlatIP(d)
|
| 594 |
+
index_obj.add(X_reduced)
|
| 595 |
+
else:
|
| 596 |
+
# fallback to brute-force cosine on TF-IDF or reduced vectors
|
| 597 |
+
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 598 |
+
nn.fit(X_reduced if use_lsa else X)
|
| 599 |
+
index_obj = nn
|
| 600 |
|
| 601 |
# Summaries
|
| 602 |
cluster_counts = (
|
|
|
|
| 605 |
.sort_values("count", ascending=False)
|
| 606 |
.head(500)
|
| 607 |
)
|
| 608 |
+
# For dropdown labels: "id — label (count)"
|
| 609 |
+
cluster_counts["label"] = cluster_counts.apply(
|
| 610 |
+
lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
|
| 611 |
+
)
|
| 612 |
+
cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
|
| 613 |
|
| 614 |
domain_counts = (
|
| 615 |
df.groupby("from_domain").size()
|
| 616 |
.reset_index(name="count")
|
| 617 |
.sort_values("count", ascending=False)
|
| 618 |
+
.head(100)
|
| 619 |
)
|
| 620 |
+
domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
|
| 621 |
+
|
| 622 |
+
# Results preview default (latest 500 by date if available)
|
| 623 |
+
if "date" in df.columns and df["date"].notna().any():
|
| 624 |
+
show_df = df.copy()
|
| 625 |
+
# coerce to datetime for sort
|
| 626 |
+
show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
|
| 627 |
+
show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
|
| 628 |
+
else:
|
| 629 |
+
show_df = df.copy()
|
| 630 |
|
| 631 |
+
cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
|
| 632 |
+
out_table = show_df[cols_out].head(500)
|
|
|
|
|
|
|
| 633 |
|
| 634 |
+
status_md = (
|
| 635 |
+
f"**Processed {len(df):,} emails** \n"
|
| 636 |
+
f"TF-IDF shape = {X.shape[0]:,} × {X.shape[1]:,} | "
|
| 637 |
+
f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
|
| 638 |
+
f"k = {k} | Search = {'Faiss (IP on LSA)' if (use_faiss and use_lsa and FAISS_OK) else 'cosine brute-force'}"
|
| 639 |
+
)
|
| 640 |
|
| 641 |
+
# Free some heavy temporaries from local scope
|
| 642 |
gc.collect()
|
| 643 |
|
| 644 |
+
return (status_md,
|
| 645 |
+
cluster_counts, domain_counts,
|
| 646 |
+
out_table,
|
| 647 |
+
df, vec, (X_reduced if use_lsa else None), index_obj, term_names,
|
| 648 |
+
use_lsa, (use_faiss and use_lsa and FAISS_OK),
|
| 649 |
+
cluster_choices, domain_choices)
|
| 650 |
|
| 651 |
+
# Wire process
|
| 652 |
+
(run_btn.click)(
|
| 653 |
process_file,
|
| 654 |
+
inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
|
| 655 |
+
use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss],
|
| 656 |
+
outputs=[status,
|
| 657 |
+
cluster_counts_df, domain_counts_df,
|
| 658 |
+
results_df,
|
| 659 |
+
state_df, state_vec, state_X_reduced, state_index, state_term_names,
|
| 660 |
+
state_use_lsa, state_use_faiss,
|
| 661 |
+
cluster_drop, domain_drop]
|
| 662 |
)
|
| 663 |
|
| 664 |
+
# -------- Filtering & Search --------
|
| 665 |
+
def refresh_results(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end):
    """Re-apply the sidebar filters and return up to 500 rows for display.

    Rows are ordered newest-first when a usable "date" column exists;
    otherwise the filtered frame is shown in its current order.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    display_cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
    filtered = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)

    sortable = "date" in filtered.columns and filtered["date"].notna().any()
    if not sortable:
        return filtered[display_cols].head(500)

    # Sort on a throwaway parsed-datetime column, newest first.
    ordered = filtered.copy()
    ordered["_dt"] = pd.to_datetime(ordered["date"], utc=True, errors="coerce")
    ordered = ordered.sort_values("_dt", ascending=False).drop(columns=["_dt"])
    return ordered[display_cols].head(500)
|
| 677 |
+
|
| 678 |
+
for ctrl in [cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]:
|
| 679 |
+
ctrl.change(
|
| 680 |
+
refresh_results,
|
| 681 |
+
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 682 |
+
outputs=[results_df]
|
| 683 |
+
)
|
| 684 |
+
|
| 685 |
+
reset_btn.click(
|
| 686 |
+
lambda: ["(any)", "(any)", "(any)", "(any)", "", ""],
|
| 687 |
+
inputs=[],
|
| 688 |
+
outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
|
| 689 |
+
).then(
|
| 690 |
+
refresh_results,
|
| 691 |
+
inputs=[state_df, cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end],
|
| 692 |
+
outputs=[results_df]
|
| 693 |
+
)
|
| 694 |
+
|
| 695 |
+
def _tokenize_query(q: str) -> List[str]:
|
| 696 |
+
if not q:
|
| 697 |
+
return []
|
| 698 |
+
# split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
|
| 699 |
+
parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
|
| 700 |
+
# dedupe while preserving order
|
| 701 |
+
seen, out = set(), []
|
| 702 |
+
for p in parts:
|
| 703 |
+
if p.lower() not in seen:
|
| 704 |
+
out.append(p)
|
| 705 |
+
seen.add(p.lower())
|
| 706 |
+
return out[:8] # limit highlights for performance
|
| 707 |
+
|
| 708 |
+
def search_fn(q, df, vec, X_reduced, index_obj, use_lsa_flag, use_faiss_flag):
    """Nearest-neighbour search for query *q* over the indexed corpus.

    Returns (results_dataframe, query_terms). Only the sklearn
    NearestNeighbors path can actually serve queries: a Faiss index lives
    in LSA space, and the fitted SVD/Normalizer were deliberately not
    persisted (to save RAM), so a TF-IDF query vector cannot be projected
    into it — that case returns an empty frame rather than wrong matches.
    The *use_lsa_flag*/*use_faiss_flag* parameters are retained for
    interface compatibility with the stored state but are no longer
    consulted: the index object's type is authoritative.

    Args:
        q: raw query string from the search box.
        df: the full processed dataframe.
        vec: fitted TfidfVectorizer used to embed the query.
        X_reduced: unused here (kept for signature compatibility).
        index_obj: sklearn NearestNeighbors or a Faiss index.

    Returns:
        (top-50 results with a "score" column, tokenized query terms).
    """
    if (not q) or (df is None) or (vec is None) or (index_obj is None):
        return pd.DataFrame(), []

    q_terms = _tokenize_query(q)
    q_vec = vec.transform([q])  # TF-IDF space

    if not isinstance(index_obj, NearestNeighbors):
        # Faiss (or unknown) index: no way to project q into LSA space
        # here, so refuse rather than return mismatched neighbours.
        return pd.DataFrame(), q_terms

    try:
        distances, indices = index_obj.kneighbors(q_vec, n_neighbors=min(50, len(df)))
    except ValueError:
        # The index was fit on LSA-reduced vectors, whose dimensionality
        # differs from the TF-IDF query — searching is impossible without
        # the (unpersisted) projection, so degrade gracefully.
        return pd.DataFrame(), q_terms

    results = df.iloc[indices[0]].copy()
    results["score"] = 1.0 - distances[0]  # cosine distance -> similarity
    cols = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment", "score"]
    return results[cols].head(50), q_terms
|
| 750 |
|
| 751 |
search_btn.click(
|
| 752 |
search_fn,
|
| 753 |
+
inputs=[search_query, state_df, state_vec, state_X_reduced, state_index, state_use_lsa, state_use_faiss],
|
| 754 |
+
outputs=[results_df, state_query_terms]
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str], query_terms: Optional[List[str]]):
    """Render the clicked results row as highlighted HTML for the reader pane.

    The displayed table only carries a projection of the columns, so the
    selected row is mapped back to the full dataframe by matching
    subject/sender/date; if that narrows to nothing, a subject-only
    retry is attempted before giving up with an empty string.
    """
    # Gradio may hand back the selection index as a scalar or a pair.
    try:
        idx = evt.index
        row_idx = idx[0] if isinstance(idx, (list, tuple)) else idx
    except Exception:
        row_idx = getattr(evt, "index", None)

    if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
        return ""

    sel = table.iloc[row_idx]

    # Narrow the full dataframe by each identifying field that is present.
    cand = df
    for col in ("subject", "from_email", "date"):
        value = sel.get(col, None)
        if value is not None:
            cand = cand[cand[col] == value]

    if len(cand) == 0:
        # Looser retry: subject alone (empty-string fallback if missing).
        cand = df[df["subject"] == sel.get("subject", "")]
    if len(cand) == 0:
        return ""

    row = cand.iloc[0]
    cid = int(row.get("cluster_id", -1))
    cluster_label = term_names.get(cid, f"cluster_{cid}") if term_names else None
    return build_highlighted_html(row, query_terms=query_terms, cluster_label=cluster_label)
|
| 785 |
+
|
| 786 |
+
results_df.select(
|
| 787 |
+
on_row_select,
|
| 788 |
+
inputs=[results_df, state_df, state_term_names, state_query_terms],
|
| 789 |
+
outputs=[email_view]
|
| 790 |
)
|
| 791 |
|
| 792 |
if __name__ == "__main__":
|