wuhp committed on
Commit
2ebeb60
·
verified ·
1 Parent(s): fac591d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -37
app.py CHANGED
@@ -308,30 +308,33 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
308
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
309
  dir_attr = ' dir="rtl"' if rtl else ""
310
 
 
 
 
311
  tag_html = ""
312
  if isinstance(tags, list) and tags:
313
  tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
314
 
315
  cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
316
 
317
- html = f"""
318
- <div class="email-card">
319
- <div class="email-header">
320
- <div>
321
- <div class="subject">{subject_h or "(no subject)"}</div>
322
- <div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>
323
- </div>
324
- <div class="badges">
325
- {cluster_html}
326
- <span class="sentiment">sentiment: <b>{sentiment}</b></span>
327
- {tag_html}
328
- </div>
329
- </div>
330
- <div class="email-body" {dir_attr}>
331
- {body_h.replace('\n','<br/>')}
332
- </div>
333
- </div>
334
- """
335
  return html
336
 
337
  def top_terms_per_cluster(X, labels, vectorizer, topn=6):
@@ -713,14 +716,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
713
  # Vectorize the query
714
  q_vec = vec.transform([q])
715
  if use_lsa_flag and X_reduced is not None:
716
- # Project q into LSA space using the same SVD+Normalizer is ideal,
717
- # but we didn't return SVD/Normalizer objects to minimize memory.
718
- # Approximation: use the KNN over TF-IDF if Faiss (LSA) not available.
719
- if use_faiss_flag and isinstance(index_obj, faiss.IndexFlatIP):
720
- # We need the same SVD+Normalizer to project q; since we didn’t persist them,
721
- # fallback gracefully to TF-IDF brute-force nearest neighbors.
722
- # So here: if Faiss present but we can't project q, we simply fallback below.
723
- pass
724
 
725
  # If we have a sklearn NearestNeighbors (cosine brute-force)
726
  if isinstance(index_obj, NearestNeighbors):
@@ -730,17 +728,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
730
  results = df.iloc[inds].copy()
731
  results["score"] = sims
732
  elif FAISS_OK and isinstance(index_obj, faiss.Index):
733
- # We cannot re-compute SVD projection here; so we approximate by doing TF-IDF brute force
734
- # to avoid mismatch. This keeps correctness at the cost of speed for queries.
735
- nn = NearestNeighbors(metric="cosine", algorithm="brute")
736
- nn.fit(q_vec.__class__(q_vec)) # no-op to appease types
737
- # build a temporary NN on the corpus TF-IDF
738
- nn = NearestNeighbors(metric="cosine", algorithm="brute")
739
- # Fit once per search is heavy; instead, do manual cosine on sparse matrix:
740
- # Efficient manual sparse cosine for 1 query:
741
- # sim = X.dot(q_vec.T).A.ravel() / (||X|| * ||q||)
742
- # But we didn’t keep X to save RAM; thus fallback to building a temp NN:
743
- # Since we can't access X here, safer path: inform limited ANN for query, fallback to vectorizer NN path.
744
  return pd.DataFrame(), q_terms
745
  else:
746
  return pd.DataFrame(), q_terms
 
308
  rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
309
  dir_attr = ' dir="rtl"' if rtl else ""
310
 
311
+ # PRECOMPUTE to avoid backslashes inside f-string expressions
312
+ body_html = body_h.replace("\n", "<br/>")
313
+
314
  tag_html = ""
315
  if isinstance(tags, list) and tags:
316
  tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
317
 
318
  cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
319
 
320
+ html = (
321
+ f'<div class="email-card">'
322
+ f' <div class="email-header">'
323
+ f' <div>'
324
+ f' <div class="subject">{subject_h or "(no subject)"}</div>'
325
+ f' <div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>'
326
+ f' </div>'
327
+ f' <div class="badges">'
328
+ f' {cluster_html}'
329
+ f' <span class="sentiment">sentiment: <b>{sentiment}</b></span>'
330
+ f' {tag_html}'
331
+ f' </div>'
332
+ f' </div>'
333
+ f' <div class="email-body"{dir_attr}>'
334
+ f' {body_html}'
335
+ f' </div>'
336
+ f'</div>'
337
+ )
338
  return html
339
 
340
  def top_terms_per_cluster(X, labels, vectorizer, topn=6):
 
716
  # Vectorize the query
717
  q_vec = vec.transform([q])
718
  if use_lsa_flag and X_reduced is not None:
719
+ # Ideally, project q with the same SVD+Normalizer; since we didn't persist them,
720
+ # we fall back to the TF-IDF brute-force path below.
721
+ pass
 
 
 
 
 
722
 
723
  # If we have a sklearn NearestNeighbors (cosine brute-force)
724
  if isinstance(index_obj, NearestNeighbors):
 
728
  results = df.iloc[inds].copy()
729
  results["score"] = sims
730
  elif FAISS_OK and isinstance(index_obj, faiss.Index):
731
+ # Can't project the query into LSA here without the SVD/Normalizer objects;
732
+ # so skip ANN for ad-hoc queries and return no results (or implement a TF-IDF fallback if you keep X).
 
 
 
 
 
 
 
 
 
733
  return pd.DataFrame(), q_terms
734
  else:
735
  return pd.DataFrame(), q_terms