wuhp commited on
Commit
c8241c4
·
verified ·
1 Parent(s): c8dbc29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -52
app.py CHANGED
@@ -25,7 +25,7 @@ from sklearn.preprocessing import Normalizer
25
 
26
  # Optional fast ANN (CPU)
27
  try:
28
- import faiss # faiss-cpu on HF Space
29
  FAISS_OK = True
30
  except Exception:
31
  FAISS_OK = False
@@ -38,17 +38,31 @@ except Exception:
38
  VADER_OK = False
39
 
40
  # =================== Regex & Flags ===================
 
41
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
 
 
42
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
 
 
43
  QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
 
 
44
  SIG_RE = re.compile(r"\n-- ?\n", re.M)
 
 
45
  SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
46
  HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
 
 
47
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
48
- FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
49
- ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
 
 
50
  SKIP_LANGDETECT = True
51
 
 
52
  SUSPECT_PHRASES = [
53
  "off the books", "cover up", "kickback", "bribe", "under the table",
54
  "no inspection", "special fee", "friendly payment", "confidential deal",
@@ -67,14 +81,22 @@ def html_to_text(html: str) -> str:
67
  return soup.get_text(separator="\n")
68
 
69
  def strip_quotes_and_sigs(text: str) -> str:
 
70
  if not text:
71
  return ""
 
72
  text = QUOTE_LINE_RE.sub("", text)
 
 
73
  parts = SIG_RE.split(text)
74
  if parts:
75
  text = parts[0]
76
- text = SENTO_FROM_RE.sub("", text) if (SENT_FROM_RE := SENT_FROM_RE) else text # keep name stable
 
 
77
  text = HEBREW_SENT_FROM_RE.sub("", text)
 
 
78
  cut = None
79
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
80
  m = pat.search(text)
@@ -83,9 +105,11 @@ def strip_quotes_and_sigs(text: str) -> str:
83
  cut = idx if (cut is None or idx < cut) else cut
84
  if cut is not None:
85
  text = text[:cut]
 
86
  return text.strip()
87
 
88
  def parse_name_email(s: str) -> Tuple[str, str]:
 
89
  if not s:
90
  return "", ""
91
  m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
@@ -94,11 +118,16 @@ def parse_name_email(s: str) -> Tuple[str, str]:
94
  return "", s.strip()
95
 
96
  def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
 
 
 
 
97
  headers: Dict[str, str] = {}
98
  lines = (text or "").splitlines()
99
  header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
100
  i = 0
101
  saw_header = False
 
102
  while i < len(lines):
103
  line = lines[i].rstrip("\r")
104
  stripped = line.strip()
@@ -136,30 +165,39 @@ def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
136
  break
137
  else:
138
  break
 
139
  body_text = "\n".join(lines[i:]) if i < len(lines) else ""
140
  return headers, body_text
141
 
142
  # =================== Normalization & Utilities ===================
143
  def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
 
144
  if str(raw.get("type", "")).lower() == "meta":
145
  return {}
 
146
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
147
- html_content = raw.get("body_html") or raw.get("html") or ""
148
  if html_content and not body_text_raw:
149
  body_text_raw = html_to_text(html_content)
 
150
  body_text_raw = ftfy.fix_text(body_text_raw or "")
 
151
  subject_text = ""
152
  from_name = from_email = from_domain = ""
153
  date_val = raw.get("date") or raw.get("Date") or ""
 
154
  if body_text_raw:
155
  headers, body_only = parse_email_headers(body_text_raw)
156
  subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
157
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
158
  date_val = headers.get("Date", "") or date_val
 
 
159
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
160
  body_clean = URL_RE.sub(" URL ", body_clean)
161
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
162
  body_text = body_clean
 
163
  from_name, from_email = parse_name_email(sender)
164
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
165
  else:
@@ -171,7 +209,9 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
171
  sender = raw.get("from") or raw.get("From") or ""
172
  from_name, from_email = parse_name_email(sender)
173
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
 
174
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
 
175
  if use_langdetect:
176
  try:
177
  lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
@@ -179,6 +219,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
179
  lang = "unknown"
180
  else:
181
  lang = "unknown"
 
182
  iso_date = ""
183
  if isinstance(date_val, (int, float)):
184
  try:
@@ -187,12 +228,15 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
187
  iso_date = ""
188
  elif isinstance(date_val, str) and date_val:
189
  iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
 
190
  msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
191
  if not msg_id:
192
  msg_id = f"gen-{uuid.uuid4().hex}"
 
193
  thread_key = subject_norm or (from_email + body_text[:120])
194
  thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
195
  text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
 
196
  return {
197
  "message_id": str(msg_id),
198
  "thread_id": thread_id,
@@ -207,6 +251,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
207
  }
208
 
209
  def has_suspect_tag(text: str) -> List[str]:
 
210
  tags = []
211
  if not text:
212
  return tags
@@ -230,17 +275,19 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
230
  analyzer = SentimentIntensityAnalyzer()
231
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
232
  df["sentiment_score"] = scores
 
233
  bins = [-1.01, -0.05, 0.05, 1.01]
234
  labels = ["negative", "neutral", "positive"]
235
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
236
  return df
237
 
238
- def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str]=None) -> str:
 
239
  subject = (row.get("subject") or "").strip()
240
- body = (row.get("body_text") or "").strip()
241
  from_email = row.get("from_email") or ""
242
- date = row.get("date") or ""
243
- tags = row.get("tags") or []
244
  sentiment = row.get("sentiment") or "(unknown)"
245
 
246
  def hi(text: str) -> str:
@@ -258,9 +305,12 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
258
  return out
259
 
260
  subject_h = hi(subject)
261
- body_h = hi(body)
 
 
262
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
263
  dir_attr = ' dir="rtl"' if rtl else ""
 
264
  body_html = body_h.replace("\n", "<br/>")
265
 
266
  tag_html = ""
@@ -294,10 +344,11 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
294
  out = {}
295
  uniq = np.unique(labels)
296
  for c in uniq:
297
- mask = (labels == c)
298
  if mask.sum() == 0:
299
  out[int(c)] = f"cluster_{c}"
300
  continue
 
301
  mean_vec = X[mask].mean(axis=0).A1
302
  if mean_vec.size == 0:
303
  out[int(c)] = f"cluster_{c}"
@@ -309,6 +360,7 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
309
  return out
310
 
311
  def auto_k_rule(n_docs: int) -> int:
 
312
  return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
313
 
314
  # =================== Gradio UI ===================
@@ -365,7 +417,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
365
  )
366
  with gr.Row():
367
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
368
- date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
369
 
370
  with gr.Row():
371
  run_btn = gr.Button("Process", variant="primary")
@@ -374,7 +426,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
374
 
375
  with gr.Row():
376
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
377
- domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
378
 
379
  gr.Markdown("### Search")
380
  with gr.Row():
@@ -384,16 +436,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
384
  email_view = gr.HTML(label="Reader")
385
 
386
  # State
387
- state_df = gr.State()
388
- state_vec = gr.State()
389
- state_X_reduced = gr.State()
390
- state_index = gr.State()
391
- state_term_names = gr.State()
392
- state_query_terms = gr.State()
393
- state_use_lsa = gr.State()
394
- state_use_faiss = gr.State()
395
- state_svd = gr.State()
396
- state_norm = gr.State()
397
 
398
  # -------- IO helpers --------
399
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
@@ -424,14 +476,18 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
424
  recs = [obj]
425
  return recs
426
 
427
- def _apply_filters(df: pd.DataFrame,
428
- cluster: Optional[str],
429
- domain: Optional[str],
430
- sentiment: str,
431
- tag_value: str,
432
- start: str, end: str) -> pd.DataFrame:
 
 
 
433
  out = df
434
  if cluster and cluster != "(any)":
 
435
  m = re.match(r"^(\d+)\s+—", cluster)
436
  if m:
437
  cid = int(m.group(1))
@@ -441,7 +497,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
441
  if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
442
  out = out[out["sentiment"].astype(str) == sentiment]
443
  if tag_value and tag_value != "(any)":
 
444
  out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
 
445
  if start:
446
  try:
447
  dt = pd.to_datetime(start, utc=True, errors="coerce")
@@ -461,15 +519,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
461
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
462
  if inbox_file is None:
463
  return ("**Please upload a file.**",
464
- None, None, None, None, None, None, None, None, None, None, None, None)
465
 
466
  use_lang = not bool(skip_lang)
467
 
468
  recs = _load_json_records(inbox_file.name)
469
  if not recs:
470
  return ("**No valid records found.**",
471
- None, None, None, None, None, None, None, None, None, None, None, None)
472
 
 
473
  normd = []
474
  for r in tqdm(recs, desc="Normalize", leave=False):
475
  out = normalize_email_record(r, use_langdetect=use_lang)
@@ -478,15 +537,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
478
  df = pd.DataFrame(normd)
479
  if df.empty:
480
  return ("**No usable email records after normalization.**",
481
- None, None, None, None, None, None, None, None, None, None, None, None)
482
 
 
483
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
484
 
 
485
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
486
  df = compute_sentiment_column(df)
487
 
 
488
  texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
489
 
 
490
  ngram_range = (1, 2) if use_bigrams else (1, 1)
491
  vec = TfidfVectorizer(
492
  analyzer="word",
@@ -499,20 +562,22 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
499
  sublinear_tf=True,
500
  dtype=np.float32,
501
  )
502
- X = vec.fit_transform(texts)
503
 
 
504
  use_lsa = bool(use_lsa)
505
  X_reduced = None
506
  svd_obj = None
507
  norm_obj = None
508
  if use_lsa:
509
  svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
510
- X_reduced_tmp = svd_obj.fit_transform(X)
511
  norm_obj = Normalizer(copy=False)
512
  X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
513
  del X_reduced_tmp
514
  gc.collect()
515
 
 
516
  if bool(auto_k):
517
  k = auto_k_rule(X.shape[0])
518
  else:
@@ -527,26 +592,32 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
527
  labels = kmeans.fit_predict(X_reduced if use_lsa else X)
528
  df["cluster_id"] = labels
529
 
 
530
  term_names = top_terms_per_cluster(X, labels, vec, topn=6)
531
  df["cluster_name"] = [term_names[int(c)] for c in labels]
532
 
 
533
  use_faiss = bool(use_faiss) and FAISS_OK
534
  index_obj = None
535
  if use_faiss and use_lsa:
536
- d = (X_reduced.shape[1])
 
537
  index_obj = faiss.IndexFlatIP(d)
538
  index_obj.add(X_reduced)
539
  else:
 
540
  nn = NearestNeighbors(metric="cosine", algorithm="brute")
541
  nn.fit(X_reduced if use_lsa else X)
542
  index_obj = nn
543
 
 
544
  cluster_counts = (
545
  df.groupby(["cluster_id", "cluster_name"]).size()
546
  .reset_index(name="count")
547
  .sort_values("count", ascending=False)
548
  .head(500)
549
  )
 
550
  cluster_counts["label"] = cluster_counts.apply(
551
  lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
552
  )
@@ -560,8 +631,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
560
  )
561
  domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
562
 
 
563
  if "date" in df.columns and df["date"].notna().any():
564
  show_df = df.copy()
 
565
  show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
566
  show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
567
  else:
@@ -579,17 +652,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
579
 
580
  gc.collect()
581
 
582
- # IMPORTANT: use gr.update to set dropdown choices + default values
583
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
584
- domain_update = gr.update(choices=domain_choices, value="(any)")
585
-
586
- return (status_md,
587
- cluster_counts, domain_counts,
588
- out_table,
589
- df, vec, (X_reduced if use_lsa else None), index_obj, term_names,
590
- use_lsa, (use_faiss and use_lsa and FAISS_OK),
591
- cluster_update, domain_update,
592
- svd_obj, norm_obj)
 
 
593
 
594
  (run_btn.click)(
595
  process_file,
@@ -610,6 +685,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
610
  return pd.DataFrame()
611
  filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
612
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
 
613
  if "date" in filt.columns and filt["date"].notna().any():
614
  tmp = filt.copy()
615
  tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
@@ -624,8 +700,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
624
  outputs=[results_df]
625
  )
626
 
 
627
  reset_btn.click(
628
- lambda: ["(any)", "(any)", "(any)", "(any)", "", ""],
629
  inputs=[],
630
  outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
631
  ).then(
@@ -637,18 +714,20 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
637
  def _tokenize_query(q: str) -> List[str]:
638
  if not q:
639
  return []
 
640
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
 
641
  seen, out = set(), []
642
  for p in parts:
643
  if p.lower() not in seen:
644
  out.append(p)
645
  seen.add(p.lower())
646
- return out[:8]
647
 
648
  def _project_query_to_lsa(q_vec, svd_obj, norm_obj) -> Optional[np.ndarray]:
649
  try:
650
- q_red = svd_obj.transform(q_vec) # (1, lsa_dim)
651
- q_red = norm_obj.transform(q_red) # normalize
652
  return q_red.astype(np.float32)
653
  except Exception:
654
  return None
@@ -657,6 +736,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
657
  if (not q) or (df is None) or (vec is None) or (index_obj is None):
658
  return pd.DataFrame(), []
659
  q_terms = _tokenize_query(q)
 
 
660
  q_vec = vec.transform([q])
661
 
662
  # Decide which space the index uses and project accordingly
@@ -691,17 +772,19 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
691
  outputs=[results_df, state_query_terms]
692
  )
693
 
694
- def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int,str], query_terms: Optional[List[str]]):
695
  try:
696
  row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
697
  except Exception:
698
  row_idx = evt.index if hasattr(evt, "index") else None
699
  if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
700
  return ""
 
701
  sel = table.iloc[row_idx]
702
  subj = sel.get("subject", None)
703
- frm = sel.get("from_email", None)
704
  dstr = sel.get("date", None)
 
705
  cand = df
706
  if subj is not None:
707
  cand = cand[cand["subject"] == subj]
@@ -710,7 +793,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
710
  if dstr is not None:
711
  cand = cand[cand["date"] == dstr]
712
  if len(cand) == 0:
713
- cand = df[df["subject"] == sel.get("subject","")]
714
  if len(cand) == 0:
715
  return ""
716
  row = cand.iloc[0]
 
25
 
26
  # Optional fast ANN (CPU)
27
  try:
28
+ import faiss # faiss-cpu on HF Space
29
  FAISS_OK = True
30
  except Exception:
31
  FAISS_OK = False
 
38
  VADER_OK = False
39
 
40
  # =================== Regex & Flags ===================
41
+ # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
42
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
43
+
44
+ # URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
45
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
46
+
47
+ # Quote lines ("> ...")
48
  QUOTE_LINE_RE = re.compile(r"^>.*$", re.M)
49
+
50
+ # Signature separator: lines after "-- " (standard)
51
  SIG_RE = re.compile(r"\n-- ?\n", re.M)
52
+
53
+ # Device footers
54
  SENT_FROM_RE = re.compile(r"\nSent from my .*$", re.M)
55
  HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
56
+
57
+ # Forward/quoted markers
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
+ FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
+ ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
+
62
+ # Toggle for language detection (skip for speed)
63
  SKIP_LANGDETECT = True
64
 
65
+ # Corruption keyword/phrase list (you can extend freely)
66
  SUSPECT_PHRASES = [
67
  "off the books", "cover up", "kickback", "bribe", "under the table",
68
  "no inspection", "special fee", "friendly payment", "confidential deal",
 
81
  return soup.get_text(separator="\n")
82
 
83
  def strip_quotes_and_sigs(text: str) -> str:
84
+ """Drop quoted lines, signatures, device footers, forwarded chains."""
85
  if not text:
86
  return ""
87
+ # remove > quoted lines
88
  text = QUOTE_LINE_RE.sub("", text)
89
+
90
+ # cut everything after signature separator
91
  parts = SIG_RE.split(text)
92
  if parts:
93
  text = parts[0]
94
+
95
+ # remove device footers
96
+ text = SENT_FROM_RE.sub("", text)
97
  text = HEBREW_SENT_FROM_RE.sub("", text)
98
+
99
+ # trim forwarded/quoted chains
100
  cut = None
101
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
102
  m = pat.search(text)
 
105
  cut = idx if (cut is None or idx < cut) else cut
106
  if cut is not None:
107
  text = text[:cut]
108
+
109
  return text.strip()
110
 
111
  def parse_name_email(s: str) -> Tuple[str, str]:
112
+ """Split 'Name <email>' into (name, email)."""
113
  if not s:
114
  return "", ""
115
  m = re.match(r'(?:"?([^"]*)"?\s)?<?([^<>]+@[^<>]+)>?', s)
 
118
  return "", s.strip()
119
 
120
  def parse_email_headers(text: str) -> Tuple[Dict[str, str], str]:
121
+ """
122
+ Extract inline headers (From, To, CC, Date, Subject) from the text blob.
123
+ Returns (headers_dict, remaining_body_text).
124
+ """
125
  headers: Dict[str, str] = {}
126
  lines = (text or "").splitlines()
127
  header_pat = re.compile(r'^(From|To|Cc|CC|Bcc|Date|Subject):')
128
  i = 0
129
  saw_header = False
130
+
131
  while i < len(lines):
132
  line = lines[i].rstrip("\r")
133
  stripped = line.strip()
 
165
  break
166
  else:
167
  break
168
+
169
  body_text = "\n".join(lines[i:]) if i < len(lines) else ""
170
  return headers, body_text
171
 
172
  # =================== Normalization & Utilities ===================
173
  def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[str, Any]:
174
+ """Normalize a single raw record into a structured row."""
175
  if str(raw.get("type", "")).lower() == "meta":
176
  return {}
177
+
178
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
179
+ html_content = raw.get("body_html") or raw.get("html") or ""
180
  if html_content and not body_text_raw:
181
  body_text_raw = html_to_text(html_content)
182
+
183
  body_text_raw = ftfy.fix_text(body_text_raw or "")
184
+
185
  subject_text = ""
186
  from_name = from_email = from_domain = ""
187
  date_val = raw.get("date") or raw.get("Date") or ""
188
+
189
  if body_text_raw:
190
  headers, body_only = parse_email_headers(body_text_raw)
191
  subject_text = headers.get("Subject", "") or raw.get("subject") or raw.get("Subject") or ""
192
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
193
  date_val = headers.get("Date", "") or date_val
194
+
195
+ # Clean body: NO phone redaction, per your request
196
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
197
  body_clean = URL_RE.sub(" URL ", body_clean)
198
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
199
  body_text = body_clean
200
+
201
  from_name, from_email = parse_name_email(sender)
202
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
203
  else:
 
209
  sender = raw.get("from") or raw.get("From") or ""
210
  from_name, from_email = parse_name_email(sender)
211
  from_domain = from_email.split("@")[-1].lower() if "@" in from_email else ""
212
+
213
  subject_norm = re.sub(r"\s+", " ", subject_text or "").strip()
214
+
215
  if use_langdetect:
216
  try:
217
  lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
 
219
  lang = "unknown"
220
  else:
221
  lang = "unknown"
222
+
223
  iso_date = ""
224
  if isinstance(date_val, (int, float)):
225
  try:
 
228
  iso_date = ""
229
  elif isinstance(date_val, str) and date_val:
230
  iso_date = pd.to_datetime(date_val, utc=True, errors="coerce").isoformat()
231
+
232
  msg_id = raw.get("message_id") or raw.get("Message-ID") or ""
233
  if not msg_id:
234
  msg_id = f"gen-{uuid.uuid4().hex}"
235
+
236
  thread_key = subject_norm or (from_email + body_text[:120])
237
  thread_id = str(pd.util.hash_pandas_object(pd.Series([thread_key], dtype="string")).astype("uint64").iloc[0])
238
  text_hash = str(pd.util.hash_pandas_object(pd.Series([body_text], dtype="string")).astype("uint64").iloc[0]) if body_text else ""
239
+
240
  return {
241
  "message_id": str(msg_id),
242
  "thread_id": thread_id,
 
251
  }
252
 
253
  def has_suspect_tag(text: str) -> List[str]:
254
+ """Return list of corruption/suspicion tags present in text."""
255
  tags = []
256
  if not text:
257
  return tags
 
275
  analyzer = SentimentIntensityAnalyzer()
276
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
277
  df["sentiment_score"] = scores
278
+ # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
279
  bins = [-1.01, -0.05, 0.05, 1.01]
280
  labels = ["negative", "neutral", "positive"]
281
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
282
  return df
283
 
284
+ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
285
+ """Email reader HTML with highlighted query terms and visible tags."""
286
  subject = (row.get("subject") or "").strip()
287
+ body = (row.get("body_text") or "").strip()
288
  from_email = row.get("from_email") or ""
289
+ date = row.get("date") or ""
290
+ tags = row.get("tags") or []
291
  sentiment = row.get("sentiment") or "(unknown)"
292
 
293
  def hi(text: str) -> str:
 
305
  return out
306
 
307
  subject_h = hi(subject)
308
+ body_h = hi(body)
309
+
310
+ # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
311
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
312
  dir_attr = ' dir="rtl"' if rtl else ""
313
+
314
  body_html = body_h.replace("\n", "<br/>")
315
 
316
  tag_html = ""
 
344
  out = {}
345
  uniq = np.unique(labels)
346
  for c in uniq:
347
+ mask = labels == c
348
  if mask.sum() == 0:
349
  out[int(c)] = f"cluster_{c}"
350
  continue
351
+ # mean TF-IDF per feature inside cluster
352
  mean_vec = X[mask].mean(axis=0).A1
353
  if mean_vec.size == 0:
354
  out[int(c)] = f"cluster_{c}"
 
360
  return out
361
 
362
  def auto_k_rule(n_docs: int) -> int:
363
+ # Sublinear scaling; keeps clusters between ~120 and 600 for big corpora
364
  return int(max(120, min(600, math.sqrt(max(n_docs, 1) / 50.0) * 110)))
365
 
366
  # =================== Gradio UI ===================
 
417
  )
418
  with gr.Row():
419
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
420
+ date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
421
 
422
  with gr.Row():
423
  run_btn = gr.Button("Process", variant="primary")
 
426
 
427
  with gr.Row():
428
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
429
+ domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
430
 
431
  gr.Markdown("### Search")
432
  with gr.Row():
 
436
  email_view = gr.HTML(label="Reader")
437
 
438
  # State
439
+ state_df = gr.State() # full dataframe
440
+ state_vec = gr.State() # TfidfVectorizer
441
+ state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
442
+ state_index = gr.State() # Faiss index or sklearn NN
443
+ state_term_names = gr.State() # dict cluster_id -> label
444
+ state_query_terms = gr.State() # last search terms list
445
+ state_use_lsa = gr.State()
446
+ state_use_faiss = gr.State()
447
+ state_svd = gr.State()
448
+ state_norm = gr.State()
449
 
450
  # -------- IO helpers --------
451
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
 
476
  recs = [obj]
477
  return recs
478
 
479
+ def _apply_filters(
480
+ df: pd.DataFrame,
481
+ cluster: Optional[str],
482
+ domain: Optional[str],
483
+ sentiment: str,
484
+ tag_value: str,
485
+ start: str,
486
+ end: str,
487
+ ) -> pd.DataFrame:
488
  out = df
489
  if cluster and cluster != "(any)":
490
+ # cluster values like "12 — payment, contract (534)"
491
  m = re.match(r"^(\d+)\s+—", cluster)
492
  if m:
493
  cid = int(m.group(1))
 
497
  if sentiment and sentiment != "(any)" and "sentiment" in out.columns:
498
  out = out[out["sentiment"].astype(str) == sentiment]
499
  if tag_value and tag_value != "(any)":
500
+ # tags is a list; check membership robustly
501
  out = out[out["tags"].apply(lambda ts: isinstance(ts, list) and (tag_value in ts))]
502
+ # date bounds
503
  if start:
504
  try:
505
  dt = pd.to_datetime(start, utc=True, errors="coerce")
 
519
  use_lsa, lsa_dim, auto_k, k_clusters, mb_batch, use_faiss):
520
  if inbox_file is None:
521
  return ("**Please upload a file.**",
522
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None)
523
 
524
  use_lang = not bool(skip_lang)
525
 
526
  recs = _load_json_records(inbox_file.name)
527
  if not recs:
528
  return ("**No valid records found.**",
529
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None)
530
 
531
+ # Normalize
532
  normd = []
533
  for r in tqdm(recs, desc="Normalize", leave=False):
534
  out = normalize_email_record(r, use_langdetect=use_lang)
 
537
  df = pd.DataFrame(normd)
538
  if df.empty:
539
  return ("**No usable email records after normalization.**",
540
+ None, None, None, None, None, None, None, None, None, None, None, None, None, None)
541
 
542
+ # Deduplicate conservatively
543
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
544
 
545
+ # Tags (suspect/finance) + Sentiment
546
  df["tags"] = df["body_text"].fillna("").map(has_suspect_tag)
547
  df = compute_sentiment_column(df)
548
 
549
+ # Texts for modeling
550
  texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
551
 
552
+ # TF-IDF (sparse CSR float32)
553
  ngram_range = (1, 2) if use_bigrams else (1, 1)
554
  vec = TfidfVectorizer(
555
  analyzer="word",
 
562
  sublinear_tf=True,
563
  dtype=np.float32,
564
  )
565
+ X = vec.fit_transform(texts) # CSR float32
566
 
567
+ # LSA (TruncatedSVD + Normalizer) for stability/quality
568
  use_lsa = bool(use_lsa)
569
  X_reduced = None
570
  svd_obj = None
571
  norm_obj = None
572
  if use_lsa:
573
  svd_obj = TruncatedSVD(n_components=int(lsa_dim or 150), random_state=0)
574
+ X_reduced_tmp = svd_obj.fit_transform(X) # dense (n_docs x lsa_dim)
575
  norm_obj = Normalizer(copy=False)
576
  X_reduced = norm_obj.fit_transform(X_reduced_tmp).astype(np.float32)
577
  del X_reduced_tmp
578
  gc.collect()
579
 
580
+ # KMeans clustering
581
  if bool(auto_k):
582
  k = auto_k_rule(X.shape[0])
583
  else:
 
592
  labels = kmeans.fit_predict(X_reduced if use_lsa else X)
593
  df["cluster_id"] = labels
594
 
595
+ # Name clusters by top terms (use original TF-IDF for interpretability)
596
  term_names = top_terms_per_cluster(X, labels, vec, topn=6)
597
  df["cluster_name"] = [term_names[int(c)] for c in labels]
598
 
599
+ # Build search index
600
  use_faiss = bool(use_faiss) and FAISS_OK
601
  index_obj = None
602
  if use_faiss and use_lsa:
603
+ # cosine ≈ inner product on normalized vectors
604
+ d = X_reduced.shape[1]
605
  index_obj = faiss.IndexFlatIP(d)
606
  index_obj.add(X_reduced)
607
  else:
608
+ # fallback to brute-force cosine on TF-IDF or reduced vectors
609
  nn = NearestNeighbors(metric="cosine", algorithm="brute")
610
  nn.fit(X_reduced if use_lsa else X)
611
  index_obj = nn
612
 
613
+ # Summaries
614
  cluster_counts = (
615
  df.groupby(["cluster_id", "cluster_name"]).size()
616
  .reset_index(name="count")
617
  .sort_values("count", ascending=False)
618
  .head(500)
619
  )
620
+ # For dropdown labels: "id — label (count)"
621
  cluster_counts["label"] = cluster_counts.apply(
622
  lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
623
  )
 
631
  )
632
  domain_choices = ["(any)"] + domain_counts["from_domain"].tolist()
633
 
634
+ # Results preview default (latest 500 by date if available)
635
  if "date" in df.columns and df["date"].notna().any():
636
  show_df = df.copy()
637
+ # coerce to datetime for sort
638
  show_df["_dt"] = pd.to_datetime(show_df["date"], utc=True, errors="coerce")
639
  show_df = show_df.sort_values("_dt", ascending=False).drop(columns=["_dt"])
640
  else:
 
652
 
653
  gc.collect()
654
 
655
+ # Use gr.update to set dropdown choices + default values safely
656
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
657
+ domain_update = gr.update(choices=domain_choices, value="(any)")
658
+
659
+ return (
660
+ status_md,
661
+ cluster_counts, domain_counts,
662
+ out_table,
663
+ df, vec, (X_reduced if use_lsa else None), index_obj, term_names,
664
+ use_lsa, (use_faiss and use_lsa and FAISS_OK),
665
+ cluster_update, domain_update,
666
+ svd_obj, norm_obj
667
+ )
668
 
669
  (run_btn.click)(
670
  process_file,
 
685
  return pd.DataFrame()
686
  filt = _apply_filters(df, cluster_choice, domain_choice, sentiment_choice, tag_choice, start, end)
687
  cols_out = ["date", "from_email", "from_domain", "subject", "cluster_name", "tags", "sentiment"]
688
+ # default: sort by date desc if possible
689
  if "date" in filt.columns and filt["date"].notna().any():
690
  tmp = filt.copy()
691
  tmp["_dt"] = pd.to_datetime(tmp["date"], utc=True, errors="coerce")
 
700
  outputs=[results_df]
701
  )
702
 
703
+ # Safer reset: set dropdowns to None (always valid), others to defaults
704
  reset_btn.click(
705
+ lambda: [None, None, "(any)", "(any)", "", ""],
706
  inputs=[],
707
  outputs=[cluster_drop, domain_drop, sentiment_drop, tag_drop, date_start, date_end]
708
  ).then(
 
714
  def _tokenize_query(q: str) -> List[str]:
715
  if not q:
716
  return []
717
+ # split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
718
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
719
+ # dedupe while preserving order
720
  seen, out = set(), []
721
  for p in parts:
722
  if p.lower() not in seen:
723
  out.append(p)
724
  seen.add(p.lower())
725
+ return out[:8] # limit highlights for performance
726
 
727
  def _project_query_to_lsa(q_vec, svd_obj, norm_obj) -> Optional[np.ndarray]:
728
  try:
729
+ q_red = svd_obj.transform(q_vec) # (1, lsa_dim)
730
+ q_red = norm_obj.transform(q_red) # normalize
731
  return q_red.astype(np.float32)
732
  except Exception:
733
  return None
 
736
  if (not q) or (df is None) or (vec is None) or (index_obj is None):
737
  return pd.DataFrame(), []
738
  q_terms = _tokenize_query(q)
739
+
740
+ # Vectorize the query
741
  q_vec = vec.transform([q])
742
 
743
  # Decide which space the index uses and project accordingly
 
772
  outputs=[results_df, state_query_terms]
773
  )
774
 
775
+ def on_row_select(evt: gr.SelectData, table: pd.DataFrame, df: pd.DataFrame, term_names: Dict[int, str], query_terms: Optional[List[str]]):
776
  try:
777
  row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
778
  except Exception:
779
  row_idx = evt.index if hasattr(evt, "index") else None
780
  if row_idx is None or table is None or len(table) == 0 or df is None or len(df) == 0:
781
  return ""
782
+ # Get identifying columns from the table row to map back to original df row
783
  sel = table.iloc[row_idx]
784
  subj = sel.get("subject", None)
785
+ frm = sel.get("from_email", None)
786
  dstr = sel.get("date", None)
787
+ # match in original df
788
  cand = df
789
  if subj is not None:
790
  cand = cand[cand["subject"] == subj]
 
793
  if dstr is not None:
794
  cand = cand[cand["date"] == dstr]
795
  if len(cand) == 0:
796
+ cand = df[df["subject"] == sel.get("subject", "")]
797
  if len(cand) == 0:
798
  return ""
799
  row = cand.iloc[0]