wuhp committed on
Commit
7a250cb
·
verified ·
1 Parent(s): c8241c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -42
app.py CHANGED
@@ -41,7 +41,7 @@ except Exception:
41
  # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
42
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
43
 
44
- # URLs -> "URL" (reduce feature bloat). We DO NOT redact phone numbers per your request.
45
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
46
 
47
  # Quote lines ("> ...")
@@ -56,8 +56,8 @@ HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
56
 
57
  # Forward/quoted markers
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
- FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
- ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
 
62
  # Toggle for language detection (skip for speed)
63
  SKIP_LANGDETECT = True
@@ -71,6 +71,66 @@ SUSPECT_PHRASES = [
71
  "contract splitting", "grease payment", "unreported", "unrecorded",
72
  ]
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  # =================== HTML/Text Cleanup ===================
75
  def html_to_text(html: str) -> str:
76
  if not html:
@@ -86,16 +146,13 @@ def strip_quotes_and_sigs(text: str) -> str:
86
  return ""
87
  # remove > quoted lines
88
  text = QUOTE_LINE_RE.sub("", text)
89
-
90
  # cut everything after signature separator
91
  parts = SIG_RE.split(text)
92
  if parts:
93
  text = parts[0]
94
-
95
  # remove device footers
96
  text = SENT_FROM_RE.sub("", text)
97
  text = HEBREW_SENT_FROM_RE.sub("", text)
98
-
99
  # trim forwarded/quoted chains
100
  cut = None
101
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
@@ -105,7 +162,6 @@ def strip_quotes_and_sigs(text: str) -> str:
105
  cut = idx if (cut is None or idx < cut) else cut
106
  if cut is not None:
107
  text = text[:cut]
108
-
109
  return text.strip()
110
 
111
  def parse_name_email(s: str) -> Tuple[str, str]:
@@ -176,7 +232,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
176
  return {}
177
 
178
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
179
- html_content = raw.get("body_html") or raw.get("html") or ""
180
  if html_content and not body_text_raw:
181
  body_text_raw = html_to_text(html_content)
182
 
@@ -192,7 +248,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
192
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
193
  date_val = headers.get("Date", "") or date_val
194
 
195
- # Clean body: NO phone redaction, per your request
196
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
197
  body_clean = URL_RE.sub(" URL ", body_clean)
198
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
@@ -274,20 +330,20 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
274
  return df
275
  analyzer = SentimentIntensityAnalyzer()
276
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
277
- df["sentiment_score"] = scores
278
  # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
279
  bins = [-1.01, -0.05, 0.05, 1.01]
280
  labels = ["negative", "neutral", "positive"]
 
281
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
282
  return df
283
 
284
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
285
  """Email reader HTML with highlighted query terms and visible tags."""
286
  subject = (row.get("subject") or "").strip()
287
- body = (row.get("body_text") or "").strip()
288
  from_email = row.get("from_email") or ""
289
- date = row.get("date") or ""
290
- tags = row.get("tags") or []
291
  sentiment = row.get("sentiment") or "(unknown)"
292
 
293
  def hi(text: str) -> str:
@@ -305,12 +361,11 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
305
  return out
306
 
307
  subject_h = hi(subject)
308
- body_h = hi(body)
309
 
310
  # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
311
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
312
  dir_attr = ' dir="rtl"' if rtl else ""
313
-
314
  body_html = body_h.replace("\n", "<br/>")
315
 
316
  tag_html = ""
@@ -344,7 +399,7 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
344
  out = {}
345
  uniq = np.unique(labels)
346
  for c in uniq:
347
- mask = labels == c
348
  if mask.sum() == 0:
349
  out[int(c)] = f"cluster_{c}"
350
  continue
@@ -353,9 +408,10 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
353
  if mean_vec.size == 0:
354
  out[int(c)] = f"cluster_{c}"
355
  continue
356
- idx = np.argpartition(mean_vec, -topn)[-topn:]
357
- idx = idx[np.argsort(-mean_vec[idx])]
358
- terms = [names[i] for i in idx if mean_vec[i] > 0]
 
359
  out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
360
  return out
361
 
@@ -365,18 +421,20 @@ def auto_k_rule(n_docs: int) -> int:
365
 
366
  # =================== Gradio UI ===================
367
  CSS = """
368
- :root { --pill:#eef2ff; --pill-text:#3730a3; --tag:#eee; --tag-text:#444;}
369
- .email-card { background:#fff; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.06); }
370
  .email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
371
- .subject { font-size:18px; font-weight:700; margin-bottom:6px; }
372
- .meta { color:#666; font-size:12px; }
373
  .badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
374
  .cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
375
- .sentiment { font-size:12px; color:#555; }
376
  .tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
377
- .email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.5; white-space:normal; }
 
 
378
  hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
379
- .small { color:#666; font-size:12px; }
380
  """
381
 
382
  with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
@@ -417,7 +475,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
417
  )
418
  with gr.Row():
419
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
420
- date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
421
 
422
  with gr.Row():
423
  run_btn = gr.Button("Process", variant="primary")
@@ -426,7 +484,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
426
 
427
  with gr.Row():
428
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
429
- domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
430
 
431
  gr.Markdown("### Search")
432
  with gr.Row():
@@ -436,16 +494,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
436
  email_view = gr.HTML(label="Reader")
437
 
438
  # State
439
- state_df = gr.State() # full dataframe
440
- state_vec = gr.State() # TfidfVectorizer
441
- state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
442
- state_index = gr.State() # Faiss index or sklearn NN
443
- state_term_names = gr.State() # dict cluster_id -> label
444
- state_query_terms = gr.State() # last search terms list
445
- state_use_lsa = gr.State()
446
- state_use_faiss = gr.State()
447
- state_svd = gr.State()
448
- state_norm = gr.State()
449
 
450
  # -------- IO helpers --------
451
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
@@ -654,7 +712,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
654
 
655
  # Use gr.update to set dropdown choices + default values safely
656
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
657
- domain_update = gr.update(choices=domain_choices, value="(any)")
658
 
659
  return (
660
  status_md,
@@ -714,9 +772,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
714
  def _tokenize_query(q: str) -> List[str]:
715
  if not q:
716
  return []
717
- # split on spaces, keep simple tokens; short stop words aren’t filtered to keep behavior explicit
718
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
719
- # dedupe while preserving order
720
  seen, out = set(), []
721
  for p in parts:
722
  if p.lower() not in seen:
@@ -782,7 +839,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
782
  # Get identifying columns from the table row to map back to original df row
783
  sel = table.iloc[row_idx]
784
  subj = sel.get("subject", None)
785
- frm = sel.get("from_email", None)
786
  dstr = sel.get("date", None)
787
  # match in original df
788
  cand = df
 
41
  # Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
42
  TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
43
 
44
+ # URLs -> "URL" (reduce feature bloat).
45
  URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
46
 
47
  # Quote lines ("> ...")
 
56
 
57
  # Forward/quoted markers
58
  FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
59
+ FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
60
+ ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
61
 
62
  # Toggle for language detection (skip for speed)
63
  SKIP_LANGDETECT = True
 
71
  "contract splitting", "grease payment", "unreported", "unrecorded",
72
  ]
73
 
74
+ # =================== Label cleanup helpers ===================
75
+ EN_STOP = {
76
+ "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
77
+ "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
78
+ "re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
79
+ "message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
80
+ "herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
81
+ "ny" # short common noise in your set
82
+ }
83
+ HE_STOP = {
84
+ "של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
85
+ "שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"
86
+ }
87
+ MONTHS = {
88
+ "jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
89
+ "january","february","march","april","june","july","august","september",
90
+ "october","november","december"
91
+ }
92
+ EMAIL_LIKE_RE = re.compile(r"@|^[\w\-]+\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$", re.I)
93
+ YEAR_RE = re.compile(r"^(19|20)\d{2}$")
94
+ NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
95
+ ONE_CHAR_RE = re.compile(r"^.$")
96
+
97
+ def _is_junk_term(t: str) -> bool:
98
+ tl = t.lower()
99
+ if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
100
+ return True
101
+ if EMAIL_LIKE_RE.search(tl):
102
+ return True
103
+ if YEAR_RE.match(tl):
104
+ return True
105
+ if NUMERIC_RE.match(tl):
106
+ return True
107
+ if ONE_CHAR_RE.match(tl):
108
+ return True
109
+ return False
110
+
111
+ def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
112
+ # Keep order by descending weight in idxs
113
+ ordered = idxs[np.argsort(-mean_vec[idxs])]
114
+ cleaned = []
115
+ for i in ordered:
116
+ term = names[i]
117
+ if _is_junk_term(term):
118
+ continue
119
+ cleaned.append(term)
120
+ if len(cleaned) >= want:
121
+ break
122
+ # If we filtered too hard, allow some not-too-bad tokens (but still avoid email-like)
123
+ if len(cleaned) < max(2, want//2):
124
+ for i in ordered:
125
+ term = names[i]
126
+ if EMAIL_LIKE_RE.search(term) or YEAR_RE.match(term.lower()):
127
+ continue
128
+ if term not in cleaned:
129
+ cleaned.append(term)
130
+ if len(cleaned) >= want:
131
+ break
132
+ return cleaned
133
+
134
  # =================== HTML/Text Cleanup ===================
135
  def html_to_text(html: str) -> str:
136
  if not html:
 
146
  return ""
147
  # remove > quoted lines
148
  text = QUOTE_LINE_RE.sub("", text)
 
149
  # cut everything after signature separator
150
  parts = SIG_RE.split(text)
151
  if parts:
152
  text = parts[0]
 
153
  # remove device footers
154
  text = SENT_FROM_RE.sub("", text)
155
  text = HEBREW_SENT_FROM_RE.sub("", text)
 
156
  # trim forwarded/quoted chains
157
  cut = None
158
  for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
 
162
  cut = idx if (cut is None or idx < cut) else cut
163
  if cut is not None:
164
  text = text[:cut]
 
165
  return text.strip()
166
 
167
  def parse_name_email(s: str) -> Tuple[str, str]:
 
232
  return {}
233
 
234
  body_text_raw = raw.get("body_text") or raw.get("text") or ""
235
+ html_content = raw.get("body_html") or raw.get("html") or ""
236
  if html_content and not body_text_raw:
237
  body_text_raw = html_to_text(html_content)
238
 
 
248
  sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
249
  date_val = headers.get("Date", "") or date_val
250
 
251
+ # Clean body
252
  body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
253
  body_clean = URL_RE.sub(" URL ", body_clean)
254
  body_clean = re.sub(r"\s+", " ", body_clean).strip()
 
330
  return df
331
  analyzer = SentimentIntensityAnalyzer()
332
  scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
 
333
  # VADER thresholds: [-1,-0.05), (-0.05,0.05), (0.05,1]
334
  bins = [-1.01, -0.05, 0.05, 1.01]
335
  labels = ["negative", "neutral", "positive"]
336
+ df["sentiment_score"] = scores
337
  df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
338
  return df
339
 
340
  def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
341
  """Email reader HTML with highlighted query terms and visible tags."""
342
  subject = (row.get("subject") or "").strip()
343
+ body = (row.get("body_text") or "").strip()
344
  from_email = row.get("from_email") or ""
345
+ date = row.get("date") or ""
346
+ tags = row.get("tags") or []
347
  sentiment = row.get("sentiment") or "(unknown)"
348
 
349
  def hi(text: str) -> str:
 
361
  return out
362
 
363
  subject_h = hi(subject)
364
+ body_h = hi(body)
365
 
366
  # Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
367
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
368
  dir_attr = ' dir="rtl"' if rtl else ""
 
369
  body_html = body_h.replace("\n", "<br/>")
370
 
371
  tag_html = ""
 
399
  out = {}
400
  uniq = np.unique(labels)
401
  for c in uniq:
402
+ mask = (labels == c)
403
  if mask.sum() == 0:
404
  out[int(c)] = f"cluster_{c}"
405
  continue
 
408
  if mean_vec.size == 0:
409
  out[int(c)] = f"cluster_{c}"
410
  continue
411
+ # oversample candidates, then filter junk
412
+ take = max(topn * 4, topn)
413
+ idx = np.argpartition(mean_vec, -take)[-take:]
414
+ terms = _sanitize_top_terms(names, idx, mean_vec, want=topn)
415
  out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
416
  return out
417
 
 
421
 
422
  # =================== Gradio UI ===================
423
  CSS = """
424
+ :root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
425
+ .email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
426
  .email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
427
+ .subject { color:#0f172a; font-size:18px; font-weight:700; margin-bottom:6px; }
428
+ .meta { color:#334155; font-size:12px; }
429
  .badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
430
  .cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
431
+ .sentiment { font-size:12px; color:#334155; }
432
  .tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
433
+ .email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.6; white-space:normal; color:#111827; }
434
+ .email-body a { color:#1d4ed8; text-decoration:underline; }
435
+ mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
436
  hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
437
+ .small { color:#475569; font-size:12px; }
438
  """
439
 
440
  with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
 
475
  )
476
  with gr.Row():
477
  date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
478
+ date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
479
 
480
  with gr.Row():
481
  run_btn = gr.Button("Process", variant="primary")
 
484
 
485
  with gr.Row():
486
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
487
+ domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
488
 
489
  gr.Markdown("### Search")
490
  with gr.Row():
 
494
  email_view = gr.HTML(label="Reader")
495
 
496
  # State
497
+ state_df = gr.State() # full dataframe
498
+ state_vec = gr.State() # TfidfVectorizer
499
+ state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
500
+ state_index = gr.State() # Faiss index or sklearn NN
501
+ state_term_names = gr.State() # dict cluster_id -> label
502
+ state_query_terms = gr.State() # last search terms list
503
+ state_use_lsa = gr.State()
504
+ state_use_faiss = gr.State()
505
+ state_svd = gr.State()
506
+ state_norm = gr.State()
507
 
508
  # -------- IO helpers --------
509
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
 
712
 
713
  # Use gr.update to set dropdown choices + default values safely
714
  cluster_update = gr.update(choices=cluster_choices, value="(any)")
715
+ domain_update = gr.update(choices=domain_choices, value="(any)")
716
 
717
  return (
718
  status_md,
 
772
  def _tokenize_query(q: str) -> List[str]:
773
  if not q:
774
  return []
775
+ # split on spaces, keep simple tokens; dedupe while preserving order
776
  parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
 
777
  seen, out = set(), []
778
  for p in parts:
779
  if p.lower() not in seen:
 
839
  # Get identifying columns from the table row to map back to original df row
840
  sel = table.iloc[row_idx]
841
  subj = sel.get("subject", None)
842
+ frm = sel.get("from_email", None)
843
  dstr = sel.get("date", None)
844
  # match in original df
845
  cand = df