wuhp committed on
Commit
8336b56
·
verified ·
1 Parent(s): 2ebeb60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -99
app.py CHANGED
@@ -38,31 +38,17 @@ except Exception:
38
  VADER_OK = False
39
 
40
  # =================== Regex & Flags ===================
41
- # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
42
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
43
-
44
- # URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
45
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
46
-
47
- # Quote lines ("> ...")
48
  QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
49
-
50
- # Signature separator: lines after "-- " (standard)
51
  SIG_RE = re.compile(r"\n-- ?\n", re.M)
52
-
53
- # Device footers
54
  SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
55
  HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
56
-
57
- # Forward/quoted markers
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
  FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
  ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
-
62
- # Toggle for language detection (skip for speed)
63
  SKIP_LANGDETECT = True
64
 
65
- # Corruption keyword/phrase list (you can extend freely)
66
  SUSPECT_PHRASES = [
67
  "off the books", "cover up", "kickback", "bribe", "under the table",
68
  "no inspection", "special fee", "friendly payment", "confidential deal",
@@ -81,18 +67,14 @@ def html_to_text(html: str) -> str:
81
  return soup.get_text(separator="\n")
82
 
83
  def strip_quotes_and_sigs(text: str) -> str:
84
- """Drop quoted lines, signatures, device footers, forwarded chains."""
85
  if not text:
86
  return ""
87
  text = QUOTE_LINE_RE.sub("", text)
88
-
89
  parts = SIG_RE.split(text)
90
  if parts:
91
  text = parts[0]
92
-
93
  text = SENT_FROM_RE.sub("", text)
94
  text = HEBREW_SENT_FROM_RE.sub("", text)
95
-
96
  cut = None
97
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
98
  m = pat.search(text)
@@ -101,11 +83,9 @@ def strip_quotes_and_sigs(text: str) -> str:
101
  cut = idx if (cut is None or idx < cut) else cut
102
  if cut is not None:
103
  text = text[:cut]
104
-
105
  return text.strip()
106
 
107
  def parse_name_email(s: str) -> Tuple[str, str]:
108
- """Split 'Name <email>' into (name, email)."""
109
  if not s:
110
  return "", ""
111
  m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
@@ -114,16 +94,11 @@ def parse_name_email(s: str) -> Tuple[str, str]:
114
  return "", s.strip()
115
 
116
  def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
117
- """
118
- Extract inline headers (From, To, CC, Date, Subject) from the text blob.
119
- Returns (headers_dict, remaining_body_text).
120
- """
121
  headers: Dict[str, str] = {}
122
  lines = (text or "").splitlines()
123
  header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
124
  i = 0
125
  saw_header = False
126
-
127
  while i < len(lines):
128
  line = lines[i].rstrip("\r")
129
  stripped = line.strip()
@@ -161,39 +136,30 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
161
  break
162
  else:
163
  break
164
-
165
  body_text = "\n".join(lines[i:]) if i < len(lines) else ""
166
  return headers, body_text
167
 
168
  # =================== Normalization & Utilities ===================
169
  def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
170
- """Normalize a single raw record into a structured row."""
171
  if str(raw.get("type", "")).lower() == "meta":
172
  return {}
173
-
174
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
175
  html_content = raw.get("body_html") or raw.get("html") or ""
176
  if html_content and not body_text_raw:
177
  body_text_raw = html_to_text(html_content)
178
-
179
  body_text_raw = ftfy.fix_text(body_text_raw or "")
180
-
181
  subject_text = ""
182
  from_name = from_email = from_domain = ""
183
  date_val = raw.get("date") or raw.get("Date") or ""
184
-
185
  if body_text_raw:
186
  headers, body_only = parse_email_headers(body_text_raw)
187
  subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
188
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
189
  date_val = headers.get("Date", "") or date_val
190
-
191
- # Clean body: NO phone redaction, per your request
192
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
193
  body_clean = URL_RE.sub(" URL ", body_clean)
194
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
195
  body_text = body_clean
196
-
197
  from_name, from_email = parse_name_email(sender)
198
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
199
  else:
@@ -205,9 +171,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
205
  sender = raw.get("from") or raw.get("From") or ""
206
  from_name, from_email = parse_name_email(sender)
207
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
208
-
209
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
210
-
211
  if use_langdetect:
212
  try:
213
  lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
@@ -215,7 +179,6 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
215
  lang = "unknown"
216
  else:
217
  lang = "unknown"
218
-
219
  iso_date = ""
220
  if isinstance(date_val, (int, float)):
221
  try:
@@ -224,15 +187,12 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
224
  iso_date = ""
225
  elif isinstance(date_val, str) and date_val:
226
  iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
227
-
228
  msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
229
  if not msg_id:
230
  msg_id = f"gen-{uuid.uuid4().hex}"
231
-
232
  thread_key = subject_norm or (from_email + body_text[:120])
233
  thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
234
  text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
235
-
236
  return {
237
  "message_id": str(msg_id),
238
  "thread_id": thread_id,
@@ -247,22 +207,20 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
247
  }
248
 
249
  def has_suspect_tag(text: str) -> List[str]:
250
- """Return list of corruption/suspicion tags present in text."""
251
  tags = []
252
  if not text:
253
  return tags
254
  low = text.lower()
255
- hits = []
256
  for phrase in SUSPECT_PHRASES:
257
  if phrase in low:
258
- hits.append("🚩suspect")
259
  break
260
  if "invoice" in low or "payment" in low or "contract" in low:
261
- hits.append("finance")
262
  if "wire" in low or "transfer" in low or "cash" in low:
263
- if "finance" not in hits:
264
- hits.append("finance")
265
- return hits
266
 
267
  def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
268
  if not VADER_OK:
@@ -272,14 +230,12 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
272
  analyzer = SentimentIntensityAnalyzer()
273
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
274
  df["sentiment_score"] = scores
275
- # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
276
  bins = [-1.01, -0.05, 0.05, 1.01]
277
  labels = ["negative", "neutral", "positive"]
278
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
279
  return df
280
 
281
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
282
- """Email reader HTML with highlighted query terms and visible tags."""
283
  subject = (row.get("subject") or "").strip()
284
  body = (row.get("body_text") or "").strip()
285
  from_email = row.get("from_email") or ""
@@ -303,12 +259,8 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
303
 
304
  subject_h = hi(subject)
305
  body_h = hi(body)
306
-
307
- # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
308
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
309
  dir_attr = ' dir="rtl"' if rtl else ""
310
-
311
- # PRECOMPUTE to avoid backslashes inside f-string expressions
312
  body_html = body_h.replace("\n", "<br/>")
313
 
314
  tag_html = ""
@@ -346,7 +298,6 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
346
  if mask.sum() == 0:
347
  out[int(c)] = f"cluster_{c}"
348
  continue
349
- # mean TF-IDF per feature inside cluster
350
  mean_vec = X[mask].mean(axis=0).A1
351
  if mean_vec.size == 0:
352
  out[int(c)] = f"cluster_{c}"
@@ -358,7 +309,6 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
358
  return out
359
 
360
  def auto_k_rule(n_docs: int) -> int:
361
- # Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
362
  return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
363
 
364
  # =================== Gradio UI ===================
@@ -430,16 +380,17 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
430
  with gr.Row():
431
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
432
  search_btn = gr.Button("Search")
433
- results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True, height=360)
 
434
  email_view = gr.HTML(label="Reader")
435
 
436
  # State
437
- state_df = gr.State() # full dataframe
438
- state_vec = gr.State() # TfidfVectorizer
439
- state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
440
- state_index = gr.State() # Faiss index or sklearn NN
441
- state_term_names = gr.State() # dict cluster_id -> label
442
- state_query_terms = gr.State() # last search terms list
443
  state_use_lsa = gr.State()
444
  state_use_faiss = gr.State()
445
 
@@ -480,7 +431,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
480
  start: str, end: str) -> pd.DataFrame:
481
  out = df
482
  if cluster and cluster != "(any)":
483
- # cluster values like "12 — payment, contract (534)"
484
  m = re.match(r"^(\d+)\s+—", cluster)
485
  if m:
486
  cid = int(m.group(1))
@@ -490,9 +440,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
490
  if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
491
  out = out[out["sentiment"].astype(str) == sentiment]
492
  if tag_value and tag_value != "(any)":
493
- # tags is a list; check membership robustly
494
  out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
495
- # date bounds
496
  if start:
497
  try:
498
  dt = pd.to_datetime(start, utc=True, errors="coerce")
@@ -521,7 +469,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
521
  return ("**No valid records found.**",
522
  None, None, None, None, None, None, None, None, None, None)
523
 
524
- # Normalize
525
  normd = []
526
  for r in tqdm(recs, desc="Normalize", leave=False):
527
  out = normalize_email_record(r, use_langdetect=use_lang)
@@ -532,17 +479,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
532
  return ("**No usable email records after normalization.**",
533
  None, None, None, None, None, None, None, None, None, None)
534
 
535
- # Deduplicate conservatively
536
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
537
 
538
- # Tags (suspect/finance) + Sentiment
539
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
540
  df = compute_sentiment_column(df)
541
 
542
- # Texts for modeling
543
  texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
544
 
545
- # TF-IDF (sparse CSR float32)
546
  ngram_range = (1, 2) if use_bigrams else (1, 1)
547
  vec = TfidfVectorizer(
548
  analyzer="word",
@@ -555,20 +498,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
555
  sublinear_tf=True,
556
  dtype=np.float32,
557
  )
558
- X = vec.fit_transform(texts) # CSR float32
559
 
560
- # LSA (TruncatedSVD + Normalizer) for stability/quality
561
  use_lsa = bool(use_lsa)
562
  X_reduced = None
563
  if use_lsa:
564
  svd = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
565
- X_reduced_tmp = svd.fit_transform(X) # dense (n_docs x lsa_dim)
566
  normalizer = Normalizer(copy=False)
567
  X_reduced = normalizer.fit_transform(X_reduced_tmp).astype(np.float32)
568
  del X_reduced_tmp
569
  gc.collect()
570
 
571
- # KMeans clustering
572
  if bool(auto_k):
573
  k = auto_k_rule(X.shape[0])
574
  else:
@@ -583,32 +524,26 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
583
  labels = kmeans.fit_predict(X_reduced if use_lsa else X)
584
  df["cluster_id"] = labels
585
 
586
- # Name clusters by top terms (use original TF-IDF for interpretability)
587
  term_names = top_terms_per_cluster(X, labels, vec, topn=6)
588
  df["cluster_name"] = [term_names[int(c)] for c in labels]
589
 
590
- # Build search index
591
  use_faiss = bool(use_faiss) and FAISS_OK
592
  index_obj = None
593
  if use_faiss and use_lsa:
594
- # cosine ≈ inner product on normalized vectors
595
  d = (X_reduced.shape[1])
596
  index_obj = faiss.IndexFlatIP(d)
597
  index_obj.add(X_reduced)
598
  else:
599
- # fallback to brute-force cosine on TF-IDF or reduced vectors
600
  nn = NearestNeighbors(metric="cosine", algorithm="brute")
601
  nn.fit(X_reduced if use_lsa else X)
602
  index_obj = nn
603
 
604
- # Summaries
605
  cluster_counts = (
606
  df.groupby(["cluster_id", "cluster_name"]).size()
607
  .reset_index(name="count")
608
  .sort_values("count", ascending=False)
609
  .head(500)
610
  )
611
- # For dropdown labels: "id — label (count)"
612
  cluster_counts["label"] = cluster_counts.apply(
613
  lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
614
  )
@@ -622,10 +557,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
622
  )
623
  domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
624
 
625
- # Results preview default (latest 500 by date if available)
626
  if "date" in df.columns and df["date"].notna().any():
627
  show_df = df.copy()
628
- # coerce to datetime for sort
629
  show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
630
  show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
631
  else:
@@ -641,7 +574,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
641
  f"k = {k} | Search = {'Faiss (IP on LSA)' if (use_faiss and use_lsa and FAISS_OK) else 'cosine brute-force'}"
642
  )
643
 
644
- # Free some heavy temporaries from local scope
645
  gc.collect()
646
 
647
  return (status_md,
@@ -651,7 +583,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
651
  use_lsa, (use_faiss and use_lsa and FAISS_OK),
652
  cluster_choices, domain_choices)
653
 
654
- # Wire process
655
  (run_btn.click)(
656
  process_file,
657
  inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
@@ -670,7 +601,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
670
  return pd.DataFrame()
671
  filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
672
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
673
- # default: sort by date desc if possible
674
  if "date" in filt.columns and filt["date"].notna().any():
675
  tmp = filt.copy()
676
  tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
@@ -698,29 +628,20 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
698
  def _tokenize_query(q: str) -> List[str]:
699
  if not q:
700
  return []
701
- # split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
702
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
703
- # dedupe while preserving order
704
  seen, out = set(), []
705
  for p in parts:
706
  if p.lower() not in seen:
707
  out.append(p)
708
  seen.add(p.lower())
709
- return out[:8] # limit highlights for performance
710
 
711
  def search_fn(q, df, vec, X_reduced, index_obj, use_lsa_flag, use_faiss_flag):
712
  if (not q) or (df is None) or (vec is None) or (index_obj is None):
713
  return pd.DataFrame(), []
714
  q_terms = _tokenize_query(q)
715
-
716
- # Vectorize the query
717
  q_vec = vec.transform([q])
718
- if use_lsa_flag and X_reduced is not None:
719
- # Ideally, project q with the same SVD+Normalizer; since we didn't persist them,
720
- # we fall back to the TF-IDF brute-force path below.
721
- pass
722
 
723
- # If we have a sklearn NearestNeighbors (cosine brute-force)
724
  if isinstance(index_obj, NearestNeighbors):
725
  distances, indices = index_obj.kneighbors(q_vec, n_neighbors=min(50, len(df)))
726
  inds = indices[0]
@@ -728,8 +649,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
728
  results = df.iloc[inds].copy()
729
  results["score"] = sims
730
  elif FAISS_OK and isinstance(index_obj, faiss.Index):
731
- # Can't project the query into LSA here without the SVD/Normalizer objects;
732
- # so skip ANN for ad-hoc queries and return no results (or implement a TF-IDF fallback if you keep X).
733
  return pd.DataFrame(), q_terms
734
  else:
735
  return pd.DataFrame(), q_terms
@@ -750,12 +670,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
750
  row_idx = evt.index if hasattr(evt, "index") else None
751
  if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
752
  return ""
753
- # Get identifying columns from the table row to map back to original df row
754
  sel = table.iloc[row_idx]
755
  subj = sel.get("subject", None)
756
  frm = sel.get("from_email", None)
757
  dstr = sel.get("date", None)
758
- # match in original df
759
  cand = df
760
  if subj is not None:
761
  cand = cand[cand["subject"] == subj]
 
38
  VADER_OK = False
39
 
40
  # =================== Regex & Flags ===================
 
41
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 
 
42
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
 
 
43
  QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
 
 
44
  SIG_RE = re.compile(r"\n-- ?\n", re.M)
 
 
45
  SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
46
  HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
 
 
47
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
48
  FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
49
  ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
 
 
50
  SKIP_LANGDETECT = True
51
 
 
52
  SUSPECT_PHRASES = [
53
  "off the books", "cover up", "kickback", "bribe", "under the table",
54
  "no inspection", "special fee", "friendly payment", "confidential deal",
 
67
  return soup.get_text(separator="\n")
68
 
69
  def strip_quotes_and_sigs(text: str) -> str:
 
70
  if not text:
71
  return ""
72
  text = QUOTE_LINE_RE.sub("", text)
 
73
  parts = SIG_RE.split(text)
74
  if parts:
75
  text = parts[0]
 
76
  text = SENT_FROM_RE.sub("", text)
77
  text = HEBREW_SENT_FROM_RE.sub("", text)
 
78
  cut = None
79
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
80
  m = pat.search(text)
 
83
  cut = idx if (cut is None or idx < cut) else cut
84
  if cut is not None:
85
  text = text[:cut]
 
86
  return text.strip()
87
 
88
  def parse_name_email(s: str) -> Tuple[str, str]:
 
89
  if not s:
90
  return "", ""
91
  m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
 
94
  return "", s.strip()
95
 
96
  def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
 
 
 
 
97
  headers: Dict[str, str] = {}
98
  lines = (text or "").splitlines()
99
  header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
100
  i = 0
101
  saw_header = False
 
102
  while i < len(lines):
103
  line = lines[i].rstrip("\r")
104
  stripped = line.strip()
 
136
  break
137
  else:
138
  break
 
139
  body_text = "\n".join(lines[i:]) if i < len(lines) else ""
140
  return headers, body_text
141
 
142
  # =================== Normalization & Utilities ===================
143
  def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
 
144
  if str(raw.get("type", "")).lower() == "meta":
145
  return {}
 
146
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
147
  html_content = raw.get("body_html") or raw.get("html") or ""
148
  if html_content and not body_text_raw:
149
  body_text_raw = html_to_text(html_content)
 
150
  body_text_raw = ftfy.fix_text(body_text_raw or "")
 
151
  subject_text = ""
152
  from_name = from_email = from_domain = ""
153
  date_val = raw.get("date") or raw.get("Date") or ""
 
154
  if body_text_raw:
155
  headers, body_only = parse_email_headers(body_text_raw)
156
  subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
157
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
158
  date_val = headers.get("Date", "") or date_val
 
 
159
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
160
  body_clean = URL_RE.sub(" URL ", body_clean)
161
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
162
  body_text = body_clean
 
163
  from_name, from_email = parse_name_email(sender)
164
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
165
  else:
 
171
  sender = raw.get("from") or raw.get("From") or ""
172
  from_name, from_email = parse_name_email(sender)
173
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
 
174
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
 
175
  if use_langdetect:
176
  try:
177
  lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
 
179
  lang = "unknown"
180
  else:
181
  lang = "unknown"
 
182
  iso_date = ""
183
  if isinstance(date_val, (int, float)):
184
  try:
 
187
  iso_date = ""
188
  elif isinstance(date_val, str) and date_val:
189
  iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
 
190
  msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
191
  if not msg_id:
192
  msg_id = f"gen-{uuid.uuid4().hex}"
 
193
  thread_key = subject_norm or (from_email + body_text[:120])
194
  thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
195
  text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
 
196
  return {
197
  "message_id": str(msg_id),
198
  "thread_id": thread_id,
 
207
  }
208
 
209
  def has_suspect_tag(text: str) -> List[str]:
 
210
  tags = []
211
  if not text:
212
  return tags
213
  low = text.lower()
 
214
  for phrase in SUSPECT_PHRASES:
215
  if phrase in low:
216
+ tags.append("🚩suspect")
217
  break
218
  if "invoice" in low or "payment" in low or "contract" in low:
219
+ tags.append("finance")
220
  if "wire" in low or "transfer" in low or "cash" in low:
221
+ if "finance" not in tags:
222
+ tags.append("finance")
223
+ return tags
224
 
225
  def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
226
  if not VADER_OK:
 
230
  analyzer = SentimentIntensityAnalyzer()
231
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
232
  df["sentiment_score"] = scores
 
233
  bins = [-1.01, -0.05, 0.05, 1.01]
234
  labels = ["negative", "neutral", "positive"]
235
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
236
  return df
237
 
238
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
 
239
  subject = (row.get("subject") or "").strip()
240
  body = (row.get("body_text") or "").strip()
241
  from_email = row.get("from_email") or ""
 
259
 
260
  subject_h = hi(subject)
261
  body_h = hi(body)
 
 
262
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
263
  dir_attr = ' dir="rtl"' if rtl else ""
 
 
264
  body_html = body_h.replace("\n", "<br/>")
265
 
266
  tag_html = ""
 
298
  if mask.sum() == 0:
299
  out[int(c)] = f"cluster_{c}"
300
  continue
 
301
  mean_vec = X[mask].mean(axis=0).A1
302
  if mean_vec.size == 0:
303
  out[int(c)] = f"cluster_{c}"
 
309
  return out
310
 
311
  def auto_k_rule(n_docs: int) -> int:
 
312
  return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
313
 
314
  # =================== Gradio UI ===================
 
380
  with gr.Row():
381
  search_query = gr.Textbox(label="Search (keywords, names, etc.)")
382
  search_btn = gr.Button("Search")
383
+ # Removed unsupported `height` arg for older Gradio
384
+ results_df = gr.Dataframe(label="Results (top 500 or top 50 for search)", interactive=True, wrap=True)
385
  email_view = gr.HTML(label="Reader")
386
 
387
  # State
388
+ state_df = gr.State()
389
+ state_vec = gr.State()
390
+ state_X_reduced = gr.State()
391
+ state_index = gr.State()
392
+ state_term_names = gr.State()
393
+ state_query_terms = gr.State()
394
  state_use_lsa = gr.State()
395
  state_use_faiss = gr.State()
396
 
 
431
  start: str, end: str) -> pd.DataFrame:
432
  out = df
433
  if cluster and cluster != "(any)":
 
434
  m = re.match(r"^(\d+)\s+—", cluster)
435
  if m:
436
  cid = int(m.group(1))
 
440
  if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
441
  out = out[out["sentiment"].astype(str) == sentiment]
442
  if tag_value and tag_value != "(any)":
 
443
  out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
 
444
  if start:
445
  try:
446
  dt = pd.to_datetime(start, utc=True, errors="coerce")
 
469
  return ("**No valid records found.**",
470
  None, None, None, None, None, None, None, None, None, None)
471
 
 
472
  normd = []
473
  for r in tqdm(recs, desc="Normalize", leave=False):
474
  out = normalize_email_record(r, use_langdetect=use_lang)
 
479
  return ("**No usable email records after normalization.**",
480
  None, None, None, None, None, None, None, None, None, None)
481
 
 
482
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
483
 
 
484
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
485
  df = compute_sentiment_column(df)
486
 
 
487
  texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
488
 
 
489
  ngram_range = (1, 2) if use_bigrams else (1, 1)
490
  vec = TfidfVectorizer(
491
  analyzer="word",
 
498
  sublinear_tf=True,
499
  dtype=np.float32,
500
  )
501
+ X = vec.fit_transform(texts)
502
 
 
503
  use_lsa = bool(use_lsa)
504
  X_reduced = None
505
  if use_lsa:
506
  svd = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
507
+ X_reduced_tmp = svd.fit_transform(X)
508
  normalizer = Normalizer(copy=False)
509
  X_reduced = normalizer.fit_transform(X_reduced_tmp).astype(np.float32)
510
  del X_reduced_tmp
511
  gc.collect()
512
 
 
513
  if bool(auto_k):
514
  k = auto_k_rule(X.shape[0])
515
  else:
 
524
  labels = kmeans.fit_predict(X_reduced if use_lsa else X)
525
  df["cluster_id"] = labels
526
 
 
527
  term_names = top_terms_per_cluster(X, labels, vec, topn=6)
528
  df["cluster_name"] = [term_names[int(c)] for c in labels]
529
 
 
530
  use_faiss = bool(use_faiss) and FAISS_OK
531
  index_obj = None
532
  if use_faiss and use_lsa:
 
533
  d = (X_reduced.shape[1])
534
  index_obj = faiss.IndexFlatIP(d)
535
  index_obj.add(X_reduced)
536
  else:
 
537
  nn = NearestNeighbors(metric="cosine", algorithm="brute")
538
  nn.fit(X_reduced if use_lsa else X)
539
  index_obj = nn
540
 
 
541
  cluster_counts = (
542
  df.groupby(["cluster_id", "cluster_name"]).size()
543
  .reset_index(name="count")
544
  .sort_values("count", ascending=False)
545
  .head(500)
546
  )
 
547
  cluster_counts["label"] = cluster_counts.apply(
548
  lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
549
  )
 
557
  )
558
  domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
559
 
 
560
  if "date" in df.columns and df["date"].notna().any():
561
  show_df = df.copy()
 
562
  show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
563
  show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
564
  else:
 
574
  f"k = {k} | Search = {'Faiss (IP on LSA)' if (use_faiss and use_lsa and FAISS_OK) else 'cosine brute-force'}"
575
  )
576
 
 
577
  gc.collect()
578
 
579
  return (status_md,
 
583
  use_lsa, (use_faiss and use_lsa and FAISS_OK),
584
  cluster_choices, domain_choices)
585
 
 
586
  (run_btn.click)(
587
  process_file,
588
  inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang,
 
601
  return pd.DataFrame()
602
  filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
603
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
 
604
  if "date" in filt.columns and filt["date"].notna().any():
605
  tmp = filt.copy()
606
  tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
 
628
  def _tokenize_query(q: str) -> List[str]:
629
  if not q:
630
  return []
 
631
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
 
632
  seen, out = set(), []
633
  for p in parts:
634
  if p.lower() not in seen:
635
  out.append(p)
636
  seen.add(p.lower())
637
+ return out[:8]
638
 
639
  def search_fn(q, df, vec, X_reduced, index_obj, use_lsa_flag, use_faiss_flag):
640
  if (not q) or (df is None) or (vec is None) or (index_obj is None):
641
  return pd.DataFrame(), []
642
  q_terms = _tokenize_query(q)
 
 
643
  q_vec = vec.transform([q])
 
 
 
 
644
 
 
645
  if isinstance(index_obj, NearestNeighbors):
646
  distances, indices = index_obj.kneighbors(q_vec, n_neighbors=min(50, len(df)))
647
  inds = indices[0]
 
649
  results = df.iloc[inds].copy()
650
  results["score"] = sims
651
  elif FAISS_OK and isinstance(index_obj, faiss.Index):
652
+ # Without persisted SVD/Normalizer, we can't project q to LSA; skip ANN here.
 
653
  return pd.DataFrame(), q_terms
654
  else:
655
  return pd.DataFrame(), q_terms
 
670
  row_idx = evt.index if hasattr(evt, "index") else None
671
  if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
672
  return ""
 
673
  sel = table.iloc[row_idx]
674
  subj = sel.get("subject", None)
675
  frm = sel.get("from_email", None)
676
  dstr = sel.get("date", None)
 
677
  cand = df
678
  if subj is not None:
679
  cand = cand[cand["subject"] == subj]