wuhp committed on
Commit
7a250cb
·
verified ·
1 Parent(s): c8241c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -42
app.py CHANGED
@@ -41,7 +41,7 @@ except Exception:
41
  # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
42
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
43
 
44
- # URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
45
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
46
 
47
  # Quote lines ("> ...")
@@ -56,8 +56,8 @@ HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
56
 
57
  # Forward/quoted markers
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
- FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
- ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
 
62
  # Toggle for language detection (skip for speed)
63
  SKIP_LANGDETECT = True
@@ -71,6 +71,66 @@ SUSPECT_PHRASES = [
71
  "contract splitting", "grease payment", "unreported", "unrecorded",
72
  ]
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  # =================== HTML/Text Cleanup ===================
75
  def html_to_text(html: str) -> str:
76
  if not html:
@@ -86,16 +146,13 @@ def strip_quotes_and_sigs(text: str) -> str:
86
  return ""
87
  # remove > quoted lines
88
  text = QUOTE_LINE_RE.sub("", text)
89
-
90
  # cut everything after signature separator
91
  parts = SIG_RE.split(text)
92
  if parts:
93
  text = parts[0]
94
-
95
  # remove device footers
96
  text = SENT_FROM_RE.sub("", text)
97
  text = HEBREW_SENT_FROM_RE.sub("", text)
98
-
99
  # trim forwarded/quoted chains
100
  cut = None
101
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
@@ -105,7 +162,6 @@ def strip_quotes_and_sigs(text: str) -> str:
105
  cut = idx if (cut is None or idx < cut) else cut
106
  if cut is not None:
107
  text = text[:cut]
108
-
109
  return text.strip()
110
 
111
  def parse_name_email(s: str) -> Tuple[str, str]:
@@ -176,7 +232,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
176
  return {}
177
 
178
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
179
- html_content = raw.get("body_html") or raw.get("html") or ""
180
  if html_content and not body_text_raw:
181
  body_text_raw = html_to_text(html_content)
182
 
@@ -192,7 +248,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
192
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
193
  date_val = headers.get("Date", "") or date_val
194
 
195
- # Clean body: NO phone redaction, per your request
196
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
197
  body_clean = URL_RE.sub(" URL ", body_clean)
198
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
@@ -274,20 +330,20 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
274
  return df
275
  analyzer = SentimentIntensityAnalyzer()
276
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
277
- df["sentiment_score"] = scores
278
  # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
279
  bins = [-1.01, -0.05, 0.05, 1.01]
280
  labels = ["negative", "neutral", "positive"]
 
281
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
282
  return df
283
 
284
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
285
  """Email reader HTML with highlighted query terms and visible tags."""
286
  subject = (row.get("subject") or "").strip()
287
- body = (row.get("body_text") or "").strip()
288
  from_email = row.get("from_email") or ""
289
- date = row.get("date") or ""
290
- tags = row.get("tags") or []
291
  sentiment = row.get("sentiment") or "(unknown)"
292
 
293
  def hi(text: str) -> str:
@@ -305,12 +361,11 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
305
  return out
306
 
307
  subject_h = hi(subject)
308
- body_h = hi(body)
309
 
310
  # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
311
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
312
  dir_attr = ' dir="rtl"' if rtl else ""
313
-
314
  body_html = body_h.replace("\n", "<br/>")
315
 
316
  tag_html = ""
@@ -344,7 +399,7 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
344
  out = {}
345
  uniq = np.unique(labels)
346
  for c in uniq:
347
- mask = labels == c
348
  if mask.sum() == 0:
349
  out[int(c)] = f"cluster_{c}"
350
  continue
@@ -353,9 +408,10 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
353
  if mean_vec.size == 0:
354
  out[int(c)] = f"cluster_{c}"
355
  continue
356
- idx = np.argpartition(mean_vec, -topn)[-topn:]
357
- idx = idx[np.argsort(-mean_vec[idx])]
358
- terms = [names[i] for i in idx if mean_vec[i] > 0]
 
359
  out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
360
  return out
361
 
@@ -365,18 +421,20 @@ def auto_k_rule(n_docs: int) -> int:
365
 
366
  # =================== Gradio UI ===================
367
  CSS = """
368
- :root { --pill:#eef2ff; --pill-text:#3730a3; --tag:#eee; --tag-text:#444;}
369
- .email-card { background:#fff; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.06); }
370
  .email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
371
- .subject { font-size:18px; font-weight:700; margin-bottom:6px; }
372
- .meta { color:#666; font-size:12px; }
373
  .badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
374
  .cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
375
- .sentiment { font-size:12px; color:#555; }
376
  .tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
377
- .email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.5; white-space:normal; }
 
 
378
  hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
379
- .small { color:#666; font-size:12px; }
380
  """
381
 
382
  with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
@@ -417,7 +475,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
417
  )
418
  with gr.Row():
419
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
420
- date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
421
 
422
  with gr.Row():
423
  run_btn = gr.Button("Process", variant="primary")
@@ -426,7 +484,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
426
 
427
  with gr.Row():
428
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
429
- domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
430
 
431
  gr.Markdown("### Search")
432
  with gr.Row():
@@ -436,16 +494,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
436
  email_view = gr.HTML(label="Reader")
437
 
438
  # State
439
- state_df = gr.State() # full dataframe
440
- state_vec = gr.State() # TfidfVectorizer
441
- state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
442
- state_index = gr.State() # Faiss index or sklearn NN
443
- state_term_names = gr.State() # dict cluster_id -> label
444
- state_query_terms = gr.State() # last search terms list
445
- state_use_lsa = gr.State()
446
- state_use_faiss = gr.State()
447
- state_svd = gr.State()
448
- state_norm = gr.State()
449
 
450
  # -------- IO helpers --------
451
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
@@ -654,7 +712,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
654
 
655
  # Use gr.update to set dropdown choices + default values safely
656
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
657
- domain_update = gr.update(choices=domain_choices, value="(any)")
658
 
659
  return (
660
  status_md,
@@ -714,9 +772,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
714
  def _tokenize_query(q: str) -> List[str]:
715
  if not q:
716
  return []
717
- # split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
718
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
719
- # dedupe while preserving order
720
  seen, out = set(), []
721
  for p in parts:
722
  if p.lower() not in seen:
@@ -782,7 +839,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
782
  # Get identifying columns from the table row to map back to original df row
783
  sel = table.iloc[row_idx]
784
  subj = sel.get("subject", None)
785
- frm = sel.get("from_email", None)
786
  dstr = sel.get("date", None)
787
  # match in original df
788
  cand = df
 
41
  # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
42
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
43
 
44
+ # URLs -> "URL" (reduce feature bloat).
45
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
46
 
47
  # Quote lines ("> ...")
 
56
 
57
  # Forward/quoted markers
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
+ FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
+ ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
 
62
  # Toggle for language detection (skip for speed)
63
  SKIP_LANGDETECT = True
 
71
  "contract splitting", "grease payment", "unreported", "unrecorded",
72
  ]
73
 
74
+ # =================== Label cleanup helpers ===================
75
+ EN_STOP = {
76
+ "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
77
+ "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
78
+ "re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
79
+ "message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
80
+ "herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
81
+ "ny" # short common noise in your set
82
+ }
83
+ HE_STOP = {
84
+ "של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
85
+ "שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"
86
+ }
87
+ MONTHS = {
88
+ "jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
89
+ "january","february","march","april","june","july","august","september",
90
+ "october","november","december"
91
+ }
92
+ EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
93
+ YEAR_RE = re.compile(r"^(19|20)\d{2}$")
94
+ NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
95
+ ONE_CHAR_RE = re.compile(r"^.$")
96
+
97
+ def _is_junk_term(t: str) -> bool:
98
+ tl = t.lower()
99
+ if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
100
+ return True
101
+ if EMAIL_LIKE_RE.search(tl):
102
+ return True
103
+ if YEAR_RE.match(tl):
104
+ return True
105
+ if NUMERIC_RE.match(tl):
106
+ return True
107
+ if ONE_CHAR_RE.match(tl):
108
+ return True
109
+ return False
110
+
111
+ def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
112
+ # Keep order by descending weight in idxs
113
+ ordered = idxs[np.argsort(-mean_vec[idxs])]
114
+ cleaned = []
115
+ for i in ordered:
116
+ term = names[i]
117
+ if _is_junk_term(term):
118
+ continue
119
+ cleaned.append(term)
120
+ if len(cleaned) >= want:
121
+ break
122
+ # If we filtered too hard, allow some not-too-bad tokens (but still avoid email-like)
123
+ if len(cleaned) < max(2, want//2):
124
+ for i in ordered:
125
+ term = names[i]
126
+ if EMAIL_LIKE_RE.search(term) or YEAR_RE.match(term.lower()):
127
+ continue
128
+ if term not in cleaned:
129
+ cleaned.append(term)
130
+ if len(cleaned) >= want:
131
+ break
132
+ return cleaned
133
+
134
  # =================== HTML/Text Cleanup ===================
135
  def html_to_text(html: str) -> str:
136
  if not html:
 
146
  return ""
147
  # remove > quoted lines
148
  text = QUOTE_LINE_RE.sub("", text)
 
149
  # cut everything after signature separator
150
  parts = SIG_RE.split(text)
151
  if parts:
152
  text = parts[0]
 
153
  # remove device footers
154
  text = SENT_FROM_RE.sub("", text)
155
  text = HEBREW_SENT_FROM_RE.sub("", text)
 
156
  # trim forwarded/quoted chains
157
  cut = None
158
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
 
162
  cut = idx if (cut is None or idx < cut) else cut
163
  if cut is not None:
164
  text = text[:cut]
 
165
  return text.strip()
166
 
167
  def parse_name_email(s: str) -> Tuple[str, str]:
 
232
  return {}
233
 
234
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
235
+ html_content = raw.get("body_html") or raw.get("html") or ""
236
  if html_content and not body_text_raw:
237
  body_text_raw = html_to_text(html_content)
238
 
 
248
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
249
  date_val = headers.get("Date", "") or date_val
250
 
251
+ # Clean body
252
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
253
  body_clean = URL_RE.sub(" URL ", body_clean)
254
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
 
330
  return df
331
  analyzer = SentimentIntensityAnalyzer()
332
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
 
333
  # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
334
  bins = [-1.01, -0.05, 0.05, 1.01]
335
  labels = ["negative", "neutral", "positive"]
336
+ df["sentiment_score"] = scores
337
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
338
  return df
339
 
340
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
341
  """Email reader HTML with highlighted query terms and visible tags."""
342
  subject = (row.get("subject") or "").strip()
343
+ body = (row.get("body_text") or "").strip()
344
  from_email = row.get("from_email") or ""
345
+ date = row.get("date") or ""
346
+ tags = row.get("tags") or []
347
  sentiment = row.get("sentiment") or "(unknown)"
348
 
349
  def hi(text: str) -> str:
 
361
  return out
362
 
363
  subject_h = hi(subject)
364
+ body_h = hi(body)
365
 
366
  # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
367
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
368
  dir_attr = ' dir="rtl"' if rtl else ""
 
369
  body_html = body_h.replace("\n", "<br/>")
370
 
371
  tag_html = ""
 
399
  out = {}
400
  uniq = np.unique(labels)
401
  for c in uniq:
402
+ mask = (labels == c)
403
  if mask.sum() == 0:
404
  out[int(c)] = f"cluster_{c}"
405
  continue
 
408
  if mean_vec.size == 0:
409
  out[int(c)] = f"cluster_{c}"
410
  continue
411
+ # oversample candidates, then filter junk
412
+ take = max(topn * 4, topn)
413
+ idx = np.argpartition(mean_vec, -take)[-take:]
414
+ terms = _sanitize_top_terms(names, idx, mean_vec, want=topn)
415
  out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
416
  return out
417
 
 
421
 
422
  # =================== Gradio UI ===================
423
  CSS = """
424
+ :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
425
+ .email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
426
  .email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
427
+ .subject { color:#0f172a; font-size:18px; font-weight:700; margin-bottom:6px; }
428
+ .meta { color:#334155; font-size:12px; }
429
  .badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
430
  .cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
431
+ .sentiment { font-size:12px; color:#334155; }
432
  .tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
433
+ .email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.6; white-space:normal; color:#111827; }
434
+ .email-body a { color:#1d4ed8; text-decoration:underline; }
435
+ mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
436
  hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
437
+ .small { color:#475569; font-size:12px; }
438
  """
439
 
440
  with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
 
475
  )
476
  with gr.Row():
477
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
478
+ date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
479
 
480
  with gr.Row():
481
  run_btn = gr.Button("Process", variant="primary")
 
484
 
485
  with gr.Row():
486
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
487
+ domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
488
 
489
  gr.Markdown("### Search")
490
  with gr.Row():
 
494
  email_view = gr.HTML(label="Reader")
495
 
496
  # State
497
+ state_df = gr.State() # full dataframe
498
+ state_vec = gr.State() # TfidfVectorizer
499
+ state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
500
+ state_index = gr.State() # Faiss index or sklearn NN
501
+ state_term_names = gr.State() # dict cluster_id -> label
502
+ state_query_terms = gr.State() # last search terms list
503
+ state_use_lsa = gr.State()
504
+ state_use_faiss = gr.State()
505
+ state_svd = gr.State()
506
+ state_norm = gr.State()
507
 
508
  # -------- IO helpers --------
509
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
 
712
 
713
  # Use gr.update to set dropdown choices + default values safely
714
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
715
+ domain_update = gr.update(choices=domain_choices, value="(any)")
716
 
717
  return (
718
  status_md,
 
772
  def _tokenize_query(q: str) -> List[str]:
773
  if not q:
774
  return []
775
+ # split on spaces, keep simple tokens; dedupe while preserving order
776
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
 
777
  seen, out = set(), []
778
  for p in parts:
779
  if p.lower() not in seen:
 
839
  # Get identifying columns from the table row to map back to original df row
840
  sel = table.iloc[row_idx]
841
  subj = sel.get("subject", None)
842
+ frm = sel.get("from_email", None)
843
  dstr = sel.get("date", None)
844
  # match in original df
845
  cand = df