Update app.py
Browse files
app.py
CHANGED
|
@@ -41,7 +41,7 @@ except Exception:
|
|
| 41 |
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 42 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 43 |
|
| 44 |
-
# URLs -> "URL" (reduce feature bloat).
|
| 45 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 46 |
|
| 47 |
# Quote lines ("> ...")
|
|
@@ -56,8 +56,8 @@ HEBREW_SENT_FROM_RE = re.compile(r"\nנשלח מה.*$", re.M)
|
|
| 56 |
|
| 57 |
# Forward/quoted markers
|
| 58 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 59 |
-
FWD_MSG_RE
|
| 60 |
-
ON_WROTE_RE
|
| 61 |
|
| 62 |
# Toggle for language detection (skip for speed)
|
| 63 |
SKIP_LANGDETECT = True
|
|
@@ -71,6 +71,66 @@ SUSPECT_PHRASES = [
|
|
| 71 |
"contract splitting", "grease payment", "unreported", "unrecorded",
|
| 72 |
]
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
# =================== HTML/Text Cleanup ===================
|
| 75 |
def html_to_text(html: str) -> str:
|
| 76 |
if not html:
|
|
@@ -86,16 +146,13 @@ def strip_quotes_and_sigs(text: str) -> str:
|
|
| 86 |
return ""
|
| 87 |
# remove > quoted lines
|
| 88 |
text = QUOTE_LINE_RE.sub("", text)
|
| 89 |
-
|
| 90 |
# cut everything after signature separator
|
| 91 |
parts = SIG_RE.split(text)
|
| 92 |
if parts:
|
| 93 |
text = parts[0]
|
| 94 |
-
|
| 95 |
# remove device footers
|
| 96 |
text = SENT_FROM_RE.sub("", text)
|
| 97 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
| 98 |
-
|
| 99 |
# trim forwarded/quoted chains
|
| 100 |
cut = None
|
| 101 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
|
@@ -105,7 +162,6 @@ def strip_quotes_and_sigs(text: str) -> str:
|
|
| 105 |
cut = idx if (cut is None or idx < cut) else cut
|
| 106 |
if cut is not None:
|
| 107 |
text = text[:cut]
|
| 108 |
-
|
| 109 |
return text.strip()
|
| 110 |
|
| 111 |
def parse_name_email(s: str) -> Tuple[str, str]:
|
|
@@ -176,7 +232,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 176 |
return {}
|
| 177 |
|
| 178 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 179 |
-
html_content
|
| 180 |
if html_content and not body_text_raw:
|
| 181 |
body_text_raw = html_to_text(html_content)
|
| 182 |
|
|
@@ -192,7 +248,7 @@ def normalize_email_record(raw: Dict[str, Any], use_langdetect: bool) -> Dict[st
|
|
| 192 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 193 |
date_val = headers.get("Date", "") or date_val
|
| 194 |
|
| 195 |
-
# Clean body
|
| 196 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 197 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
| 198 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
|
@@ -274,20 +330,20 @@ def compute_sentiment_column(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 274 |
return df
|
| 275 |
analyzer = SentimentIntensityAnalyzer()
|
| 276 |
scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
|
| 277 |
-
df["sentiment_score"] = scores
|
| 278 |
# VADER-style cutoffs; pd.cut bins are right-closed, so: [-1.01, -0.05] negative, (-0.05, 0.05] neutral, (0.05, 1.01] positive
|
| 279 |
bins = [-1.01, -0.05, 0.05, 1.01]
|
| 280 |
labels = ["negative", "neutral", "positive"]
|
|
|
|
| 281 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 282 |
return df
|
| 283 |
|
| 284 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
|
| 285 |
"""Email reader HTML with highlighted query terms and visible tags."""
|
| 286 |
subject = (row.get("subject") or "").strip()
|
| 287 |
-
body
|
| 288 |
from_email = row.get("from_email") or ""
|
| 289 |
-
date
|
| 290 |
-
tags
|
| 291 |
sentiment = row.get("sentiment") or "(unknown)"
|
| 292 |
|
| 293 |
def hi(text: str) -> str:
|
|
@@ -305,12 +361,11 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 305 |
return out
|
| 306 |
|
| 307 |
subject_h = hi(subject)
|
| 308 |
-
body_h
|
| 309 |
|
| 310 |
# Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
|
| 311 |
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 312 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 313 |
-
|
| 314 |
body_html = body_h.replace("\n", "<br/>")
|
| 315 |
|
| 316 |
tag_html = ""
|
|
@@ -344,7 +399,7 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
|
|
| 344 |
out = {}
|
| 345 |
uniq = np.unique(labels)
|
| 346 |
for c in uniq:
|
| 347 |
-
mask = labels == c
|
| 348 |
if mask.sum() == 0:
|
| 349 |
out[int(c)] = f"cluster_{c}"
|
| 350 |
continue
|
|
@@ -353,9 +408,10 @@ def top_terms_per_cluster(X, labels, vectorizer, topn=6):
|
|
| 353 |
if mean_vec.size == 0:
|
| 354 |
out[int(c)] = f"cluster_{c}"
|
| 355 |
continue
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
|
|
|
| 359 |
out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
|
| 360 |
return out
|
| 361 |
|
|
@@ -365,18 +421,20 @@ def auto_k_rule(n_docs: int) -> int:
|
|
| 365 |
|
| 366 |
# =================== Gradio UI ===================
|
| 367 |
CSS = """
|
| 368 |
-
:root { --pill:#eef2ff; --pill-text:#
|
| 369 |
-
.email-card { background:#
|
| 370 |
.email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
|
| 371 |
-
.subject { font-size:18px; font-weight:700; margin-bottom:6px; }
|
| 372 |
-
.meta { color:#
|
| 373 |
.badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
|
| 374 |
.cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
|
| 375 |
-
.sentiment { font-size:12px; color:#
|
| 376 |
.tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
|
| 377 |
-
.email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.
|
|
|
|
|
|
|
| 378 |
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
|
| 379 |
-
.small { color:#
|
| 380 |
"""
|
| 381 |
|
| 382 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
|
@@ -417,7 +475,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 417 |
)
|
| 418 |
with gr.Row():
|
| 419 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 420 |
-
date_end
|
| 421 |
|
| 422 |
with gr.Row():
|
| 423 |
run_btn = gr.Button("Process", variant="primary")
|
|
@@ -426,7 +484,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 426 |
|
| 427 |
with gr.Row():
|
| 428 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
|
| 429 |
-
domain_counts_df
|
| 430 |
|
| 431 |
gr.Markdown("### Search")
|
| 432 |
with gr.Row():
|
|
@@ -436,16 +494,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 436 |
email_view = gr.HTML(label="Reader")
|
| 437 |
|
| 438 |
# State
|
| 439 |
-
state_df
|
| 440 |
-
state_vec
|
| 441 |
-
state_X_reduced
|
| 442 |
-
state_index
|
| 443 |
-
state_term_names
|
| 444 |
-
state_query_terms = gr.State()
|
| 445 |
-
state_use_lsa
|
| 446 |
-
state_use_faiss
|
| 447 |
-
state_svd
|
| 448 |
-
state_norm
|
| 449 |
|
| 450 |
# -------- IO helpers --------
|
| 451 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
@@ -654,7 +712,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 654 |
|
| 655 |
# Use gr.update to set dropdown choices + default values safely
|
| 656 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 657 |
-
domain_update
|
| 658 |
|
| 659 |
return (
|
| 660 |
status_md,
|
|
@@ -714,9 +772,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 714 |
def _tokenize_query(q: str) -> List[str]:
|
| 715 |
if not q:
|
| 716 |
return []
|
| 717 |
-
# split on spaces, keep simple tokens;
|
| 718 |
parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
|
| 719 |
-
# dedupe while preserving order
|
| 720 |
seen, out = set(), []
|
| 721 |
for p in parts:
|
| 722 |
if p.lower() not in seen:
|
|
@@ -782,7 +839,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 782 |
# Get identifying columns from the table row to map back to original df row
|
| 783 |
sel = table.iloc[row_idx]
|
| 784 |
subj = sel.get("subject", None)
|
| 785 |
-
frm
|
| 786 |
dstr = sel.get("date", None)
|
| 787 |
# match in original df
|
| 788 |
cand = df
|
|
|
|
| 41 |
# Keep emails/domains in tokens; \w is unicode-aware (Hebrew included)
|
| 42 |
TOKEN_PATTERN = r"(?u)\b\w[\w.@-]{1,}\b"
|
| 43 |
|
| 44 |
+
# URLs -> "URL" (reduce feature bloat).
|
| 45 |
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.I)
|
| 46 |
|
| 47 |
# Quote lines ("> ...")
|
|
|
|
| 56 |
|
| 57 |
# Forward/quoted markers
|
| 58 |
FWD_BEGIN_RE = re.compile(r"^Begin forwarded message:", re.I | re.M)
|
| 59 |
+
FWD_MSG_RE = re.compile(r"^[-\s]*Original Message[-\s]*$", re.I | re.M)
|
| 60 |
+
ON_WROTE_RE = re.compile(r'^\s*On .* wrote:$', re.M)
|
| 61 |
|
| 62 |
# Toggle for language detection (skip for speed)
|
| 63 |
SKIP_LANGDETECT = True
|
|
|
|
| 71 |
"contract splitting", "grease payment", "unreported", "unrecorded",
|
| 72 |
]
|
| 73 |
|
| 74 |
+
# =================== Label cleanup helpers ===================
# Lower-cased English tokens that are filtered out of cluster labels:
# function words, greetings and e-mail boilerplate.
EN_STOP = {
    "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
    "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
    "re","fwd","fw","hi","hello","thanks","thank","regards","best","please","dear","mr","mrs",
    "message","original","forwarded","attached","attachment","confidential","notice","disclaimer",
    "herein","thereof","hereby","therein","regarding","subject","url","via","kind","regard",
    "ny"  # short common noise in your set
}
# Hebrew counterparts of the stop list above.
HE_STOP = {
    "של","על","זה","גם","אם","לא","את","אתה","אני","הוא","היא","הם","הן","כי","מה",
    "שלום","תודה","בברכה","מצורף","הודעה","קדימה","היי"
}
# Month names and abbreviations ("may" is both the short and the long form).
MONTHS = {
    "jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec",
    "january","february","march","april","june","july","august","september",
    "october","november","december"
}
# Matches anything containing "@", or a bare host name ending in one of the
# listed TLDs.  The optional middle group admits multi-label hosts such as
# "mail.example.com" or "example.co.uk"; group(1) is still the final TLD.
EMAIL_LIKE_RE = re.compile(
    r"@|^[\w\-]+(?:\.[\w\-]+)*\.(com|net|org|ru|us|il|ch|co|io|uk|de|fr|it)$",
    re.I,
)
# Four-digit years 1900-2099.
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
# Digit groups joined by . , : / or - (amounts, dates, times).
NUMERIC_RE = re.compile(r"^\d+([.,:/-]\d+)*$")
# Exactly one character.
ONE_CHAR_RE = re.compile(r"^.$")
|
| 96 |
+
|
| 97 |
+
def _is_junk_term(t: str) -> bool:
    """Return True when *t* should be kept out of a cluster label.

    The term is lower-cased and rejected when it is empty, appears in one of
    the stop lists (EN_STOP, HE_STOP, MONTHS), or matches one of the
    EMAIL_LIKE_RE / YEAR_RE / NUMERIC_RE / ONE_CHAR_RE patterns.
    """
    tl = t.lower()
    # An empty token carries no information; treat it as junk explicitly
    # because none of the patterns below match the empty string.
    if not tl:
        return True
    if tl in EN_STOP or tl in HE_STOP or tl in MONTHS:
        return True
    # EMAIL_LIKE_RE is searched anywhere in the term (an "@" in any position
    # counts); the remaining patterns are anchored, so match() is enough.
    pattern_checks = (
        EMAIL_LIKE_RE.search,
        YEAR_RE.match,
        NUMERIC_RE.match,
        ONE_CHAR_RE.match,
    )
    return any(check(tl) for check in pattern_checks)
|
| 110 |
+
|
| 111 |
+
def _sanitize_top_terms(names: np.ndarray, idxs: np.ndarray, mean_vec: np.ndarray, want:int) -> list:
    """Pick up to *want* label terms from the candidate feature indices.

    Args:
        names: Indexable of terms; ``names[i]`` is the term for feature ``i``.
        idxs: Candidate feature indices (any order).
        mean_vec: Per-feature weights; candidates are ranked by
            ``mean_vec[idxs]`` in descending order.
        want: Maximum number of terms to return.

    Returns:
        A list of at most *want* terms, heaviest first.  Junk terms (see
        ``_is_junk_term``) are skipped; if that leaves too few terms, a second
        pass re-admits previously rejected terms except e-mail-like tokens and
        years.
    """
    # Nothing requested: without this guard the first pass would still
    # append one term before noticing the quota was already met.
    if want <= 0:
        return []

    # Keep order by descending weight in idxs
    ordered = idxs[np.argsort(-mean_vec[idxs])]

    cleaned = []
    for i in ordered:
        term = names[i]
        if _is_junk_term(term):
            continue
        cleaned.append(term)
        if len(cleaned) >= want:
            break

    # If we filtered too hard, allow some not-too-bad tokens (but still avoid
    # email-like).  The floor is capped at `want` so a small `want` (e.g. 1)
    # that is already satisfied never triggers the fallback.
    if len(cleaned) < min(want, max(2, want // 2)):
        seen = set(cleaned)
        for i in ordered:
            # Checked before appending so the result can never exceed `want`.
            if len(cleaned) >= want:
                break
            term = names[i]
            if EMAIL_LIKE_RE.search(term) or YEAR_RE.match(term.lower()):
                continue
            if term not in seen:
                cleaned.append(term)
                seen.add(term)
    return cleaned
|
| 133 |
+
|
| 134 |
# =================== HTML/Text Cleanup ===================
|
| 135 |
def html_to_text(html: str) -> str:
|
| 136 |
if not html:
|
|
|
|
| 146 |
return ""
|
| 147 |
# remove > quoted lines
|
| 148 |
text = QUOTE_LINE_RE.sub("", text)
|
|
|
|
| 149 |
# cut everything after signature separator
|
| 150 |
parts = SIG_RE.split(text)
|
| 151 |
if parts:
|
| 152 |
text = parts[0]
|
|
|
|
| 153 |
# remove device footers
|
| 154 |
text = SENT_FROM_RE.sub("", text)
|
| 155 |
text = HEBREW_SENT_FROM_RE.sub("", text)
|
|
|
|
| 156 |
# trim forwarded/quoted chains
|
| 157 |
cut = None
|
| 158 |
for pat in (FWD_BEGIN_RE, FWD_MSG_RE, ON_WROTE_RE):
|
|
|
|
| 162 |
cut = idx if (cut is None or idx < cut) else cut
|
| 163 |
if cut is not None:
|
| 164 |
text = text[:cut]
|
|
|
|
| 165 |
return text.strip()
|
| 166 |
|
| 167 |
def parse_name_email(s: str) -> Tuple[str, str]:
|
|
|
|
| 232 |
return {}
|
| 233 |
|
| 234 |
body_text_raw = raw.get("body_text") or raw.get("text") or ""
|
| 235 |
+
html_content = raw.get("body_html") or raw.get("html") or ""
|
| 236 |
if html_content and not body_text_raw:
|
| 237 |
body_text_raw = html_to_text(html_content)
|
| 238 |
|
|
|
|
| 248 |
sender = headers.get("From", "") or raw.get("from") or raw.get("From") or ""
|
| 249 |
date_val = headers.get("Date", "") or date_val
|
| 250 |
|
| 251 |
+
# Clean body
|
| 252 |
body_clean = strip_quotes_and_sigs(ftfy.fix_text(body_only or ""))
|
| 253 |
body_clean = URL_RE.sub(" URL ", body_clean)
|
| 254 |
body_clean = re.sub(r"\s+", " ", body_clean).strip()
|
|
|
|
| 330 |
return df
|
| 331 |
analyzer = SentimentIntensityAnalyzer()
|
| 332 |
scores = df["body_text"].fillna("").map(lambda t: analyzer.polarity_scores(t)["compound"])
|
|
|
|
| 333 |
# VADER-style cutoffs; pd.cut bins are right-closed, so: [-1.01, -0.05] negative, (-0.05, 0.05] neutral, (0.05, 1.01] positive
|
| 334 |
bins = [-1.01, -0.05, 0.05, 1.01]
|
| 335 |
labels = ["negative", "neutral", "positive"]
|
| 336 |
+
df["sentiment_score"] = scores
|
| 337 |
df["sentiment"] = pd.cut(df["sentiment_score"], bins=bins, labels=labels, include_lowest=True)
|
| 338 |
return df
|
| 339 |
|
| 340 |
def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = None, cluster_label: Optional[str] = None) -> str:
|
| 341 |
"""Email reader HTML with highlighted query terms and visible tags."""
|
| 342 |
subject = (row.get("subject") or "").strip()
|
| 343 |
+
body = (row.get("body_text") or "").strip()
|
| 344 |
from_email = row.get("from_email") or ""
|
| 345 |
+
date = row.get("date") or ""
|
| 346 |
+
tags = row.get("tags") or []
|
| 347 |
sentiment = row.get("sentiment") or "(unknown)"
|
| 348 |
|
| 349 |
def hi(text: str) -> str:
|
|
|
|
| 361 |
return out
|
| 362 |
|
| 363 |
subject_h = hi(subject)
|
| 364 |
+
body_h = hi(body)
|
| 365 |
|
| 366 |
# Basic RTL detection for Hebrew/Arabic chars → add dir="rtl"
|
| 367 |
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 368 |
dir_attr = ' dir="rtl"' if rtl else ""
|
|
|
|
| 369 |
body_html = body_h.replace("\n", "<br/>")
|
| 370 |
|
| 371 |
tag_html = ""
|
|
|
|
| 399 |
out = {}
|
| 400 |
uniq = np.unique(labels)
|
| 401 |
for c in uniq:
|
| 402 |
+
mask = (labels == c)
|
| 403 |
if mask.sum() == 0:
|
| 404 |
out[int(c)] = f"cluster_{c}"
|
| 405 |
continue
|
|
|
|
| 408 |
if mean_vec.size == 0:
|
| 409 |
out[int(c)] = f"cluster_{c}"
|
| 410 |
continue
|
| 411 |
+
# oversample candidates, then filter junk
|
| 412 |
+
take = max(topn * 4, topn)
|
| 413 |
+
idx = np.argpartition(mean_vec, -take)[-take:]
|
| 414 |
+
terms = _sanitize_top_terms(names, idx, mean_vec, want=topn)
|
| 415 |
out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
|
| 416 |
return out
|
| 417 |
|
|
|
|
| 421 |
|
| 422 |
# =================== Gradio UI ===================
|
| 423 |
# Stylesheet passed to gr.Blocks(css=CSS).  Text colours are set explicitly on
# the card, subject, meta, body and <mark> rules rather than inherited.
CSS = """
:root { --pill:#eef2ff; --pill-text:#1f2937; --tag:#e5e7eb; --tag-text:#111827; }
.email-card { background:#ffffff; color:#111827; border-radius:12px; padding:16px; box-shadow:0 1px 3px rgba(0,0,0,0.08); }
.email-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; }
.subject { color:#0f172a; font-size:18px; font-weight:700; margin-bottom:6px; }
.meta { color:#334155; font-size:12px; }
.badges { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
.cluster-pill { background:var(--pill); color:var(--pill-text); padding:2px 8px; border-radius:999px; font-size:12px; }
.sentiment { font-size:12px; color:#334155; }
.tag { background:var(--tag); color:var(--tag-text); padding:2px 6px; border-radius:6px; font-size:12px; }
.email-body { margin-top:12px; max-height:520px; overflow:auto; line-height:1.6; white-space:normal; color:#111827; }
.email-body a { color:#1d4ed8; text-decoration:underline; }
mark { background:#fff59d; color:#111827; padding:0 2px; border-radius:2px; }
hr.sep { border:none; border-top:1px solid #e5e7eb; margin:10px 0; }
.small { color:#475569; font-size:12px; }
"""
|
| 439 |
|
| 440 |
with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="soft") as demo:
|
|
|
|
| 475 |
)
|
| 476 |
with gr.Row():
|
| 477 |
date_start = gr.Textbox(label="Date from (YYYY-MM-DD, optional)", value="")
|
| 478 |
+
date_end = gr.Textbox(label="Date to (YYYY-MM-DD, optional)", value="")
|
| 479 |
|
| 480 |
with gr.Row():
|
| 481 |
run_btn = gr.Button("Process", variant="primary")
|
|
|
|
| 484 |
|
| 485 |
with gr.Row():
|
| 486 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500)", interactive=False, wrap=True)
|
| 487 |
+
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 488 |
|
| 489 |
gr.Markdown("### Search")
|
| 490 |
with gr.Row():
|
|
|
|
| 494 |
email_view = gr.HTML(label="Reader")
|
| 495 |
|
| 496 |
# State
|
| 497 |
+
state_df = gr.State() # full dataframe
|
| 498 |
+
state_vec = gr.State() # TfidfVectorizer
|
| 499 |
+
state_X_reduced = gr.State() # np.ndarray (LSA normalized) or None
|
| 500 |
+
state_index = gr.State() # Faiss index or sklearn NN
|
| 501 |
+
state_term_names = gr.State() # dict cluster_id -> label
|
| 502 |
+
state_query_terms = gr.State() # last search terms list
|
| 503 |
+
state_use_lsa = gr.State()
|
| 504 |
+
state_use_faiss = gr.State()
|
| 505 |
+
state_svd = gr.State()
|
| 506 |
+
state_norm = gr.State()
|
| 507 |
|
| 508 |
# -------- IO helpers --------
|
| 509 |
def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
|
|
|
|
| 712 |
|
| 713 |
# Use gr.update to set dropdown choices + default values safely
|
| 714 |
cluster_update = gr.update(choices=cluster_choices, value="(any)")
|
| 715 |
+
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 716 |
|
| 717 |
return (
|
| 718 |
status_md,
|
|
|
|
| 772 |
def _tokenize_query(q: str) -> List[str]:
|
| 773 |
if not q:
|
| 774 |
return []
|
| 775 |
+
# split on spaces, keep simple tokens; dedupe while preserving order
|
| 776 |
parts = [p.strip() for p in re.split(r"\s+", q) if p.strip()]
|
|
|
|
| 777 |
seen, out = set(), []
|
| 778 |
for p in parts:
|
| 779 |
if p.lower() not in seen:
|
|
|
|
| 839 |
# Get identifying columns from the table row to map back to original df row
|
| 840 |
sel = table.iloc[row_idx]
|
| 841 |
subj = sel.get("subject", None)
|
| 842 |
+
frm = sel.get("from_email", None)
|
| 843 |
dstr = sel.get("date", None)
|
| 844 |
# match in original df
|
| 845 |
cand = df
|