Update app.py
Browse files
app.py
CHANGED
|
@@ -308,30 +308,33 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
|
|
| 308 |
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 309 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 310 |
|
|
|
|
|
|
|
|
|
|
| 311 |
tag_html = ""
|
| 312 |
if isinstance(tags, list) and tags:
|
| 313 |
tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
|
| 314 |
|
| 315 |
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 316 |
|
| 317 |
-
html =
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
<div>
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
</div>
|
| 324 |
-
<div class="badges">
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
</div>
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
return html
|
| 336 |
|
| 337 |
def top_terms_per_cluster(X, labels, vectorizer, topn=6):
|
|
@@ -713,14 +716,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 713 |
# Vectorize the query
|
| 714 |
q_vec = vec.transform([q])
|
| 715 |
if use_lsa_flag and X_reduced is not None:
|
| 716 |
-
#
|
| 717 |
-
#
|
| 718 |
-
|
| 719 |
-
if use_faiss_flag and isinstance(index_obj, faiss.IndexFlatIP):
|
| 720 |
-
# We need the same SVD+Normalizer to project q; since we didn’t persist them,
|
| 721 |
-
# fallback gracefully to TF-IDF brute-force nearest neighbors.
|
| 722 |
-
# So here: if Faiss present but we can't project q, we simply fallback below.
|
| 723 |
-
pass
|
| 724 |
|
| 725 |
# If we have a sklearn NearestNeighbors (cosine brute-force)
|
| 726 |
if isinstance(index_obj, NearestNeighbors):
|
|
@@ -730,17 +728,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 730 |
results = df.iloc[inds].copy()
|
| 731 |
results["score"] = sims
|
| 732 |
elif FAISS_OK and isinstance(index_obj, faiss.Index):
|
| 733 |
-
#
|
| 734 |
-
#
|
| 735 |
-
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 736 |
-
nn.fit(q_vec.__class__(q_vec)) # no-op to appease types
|
| 737 |
-
# build a temporary NN on the corpus TF-IDF
|
| 738 |
-
nn = NearestNeighbors(metric="cosine", algorithm="brute")
|
| 739 |
-
# Fit once per search is heavy; instead, do manual cosine on sparse matrix:
|
| 740 |
-
# Efficient manual sparse cosine for 1 query:
|
| 741 |
-
# sim = X.dot(q_vec.T).A.ravel() / (||X|| * ||q||)
|
| 742 |
-
# But we didn’t keep X to save RAM; thus fallback to building a temp NN:
|
| 743 |
-
# Since we can't access X here, safer path: inform limited ANN for query, fallback to vectorizer NN path.
|
| 744 |
return pd.DataFrame(), q_terms
|
| 745 |
else:
|
| 746 |
return pd.DataFrame(), q_terms
|
|
|
|
| 308 |
rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
|
| 309 |
dir_attr = ' dir="rtl"' if rtl else ""
|
| 310 |
|
| 311 |
+
# Precompute the <br/> conversion: backslashes inside f-string expressions are a SyntaxError before Python 3.12
|
| 312 |
+
body_html = body_h.replace("\n", "<br/>")
|
| 313 |
+
|
| 314 |
tag_html = ""
|
| 315 |
if isinstance(tags, list) and tags:
|
| 316 |
tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
|
| 317 |
|
| 318 |
cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
|
| 319 |
|
| 320 |
+
html = (
|
| 321 |
+
f'<div class="email-card">'
|
| 322 |
+
f' <div class="email-header">'
|
| 323 |
+
f' <div>'
|
| 324 |
+
f' <div class="subject">{subject_h or "(no subject)"}</div>'
|
| 325 |
+
f' <div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>'
|
| 326 |
+
f' </div>'
|
| 327 |
+
f' <div class="badges">'
|
| 328 |
+
f' {cluster_html}'
|
| 329 |
+
f' <span class="sentiment">sentiment: <b>{sentiment}</b></span>'
|
| 330 |
+
f' {tag_html}'
|
| 331 |
+
f' </div>'
|
| 332 |
+
f' </div>'
|
| 333 |
+
f' <div class="email-body"{dir_attr}>'
|
| 334 |
+
f' {body_html}'
|
| 335 |
+
f' </div>'
|
| 336 |
+
f'</div>'
|
| 337 |
+
)
|
| 338 |
return html
|
| 339 |
|
| 340 |
def top_terms_per_cluster(X, labels, vectorizer, topn=6):
|
|
|
|
| 716 |
# Vectorize the query
|
| 717 |
q_vec = vec.transform([q])
|
| 718 |
if use_lsa_flag and X_reduced is not None:
|
| 719 |
+
# Ideally, project q with the same SVD+Normalizer; since we didn't persist them,
|
| 720 |
+
# we fall back to the TF-IDF brute-force path below.
|
| 721 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
|
| 723 |
# If we have a sklearn NearestNeighbors (cosine brute-force)
|
| 724 |
if isinstance(index_obj, NearestNeighbors):
|
|
|
|
| 728 |
results = df.iloc[inds].copy()
|
| 729 |
results["score"] = sims
|
| 730 |
elif FAISS_OK and isinstance(index_obj, faiss.Index):
|
| 731 |
+
# Can't project the query into LSA here without the SVD/Normalizer objects;
|
| 732 |
+
# so ANN is skipped for ad-hoc queries and an empty result is returned. TODO: persist the SVD/Normalizer, or keep X to implement a TF-IDF fallback.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 733 |
return pd.DataFrame(), q_terms
|
| 734 |
else:
|
| 735 |
return pd.DataFrame(), q_terms
|