Spaces:

wuhp
/

testmail2

Sleeping

App Files Files Community

wuhp committed on Aug 31, 2025

Commit

2ebeb60

verified ·

1 Parent(s): fac591d

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -37

app.py CHANGED Viewed

@@ -308,30 +308,33 @@ def build_highlighted_html(row: pd.Series, query_terms: Optional[List[str]] = No
     rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
     dir_attr = ' dir="rtl"' if rtl else ""
     tag_html = ""
     if isinstance(tags, list) and tags:
         tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
     cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
-    html = f"""
-    <div class="email-card">
-      <div class="email-header">
-        <div>
-          <div class="subject">{subject_h or "(no subject)"}</div>
-          <div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>
-        </div>
-        <div class="badges">
-          {cluster_html}
-          <span class="sentiment">sentiment: <b>{sentiment}</b></span>
-          {tag_html}
-        </div>
-      </div>
-      <div class="email-body" {dir_attr}>
-        {body_h.replace('\n','<br/>')}
-      </div>
-    </div>
-    """
     return html
 def top_terms_per_cluster(X, labels, vectorizer, topn=6):
@@ -713,14 +716,9 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         # Vectorize the query
         q_vec = vec.transform([q])
         if use_lsa_flag and X_reduced is not None:
-            # Project q into LSA space using the same SVD+Normalizer is ideal,
-            # but we didn't return SVD/Normalizer objects to minimize memory.
-            # Approximation: use the KNN over TF-IDF if Faiss (LSA) not available.
-            if use_faiss_flag and isinstance(index_obj, faiss.IndexFlatIP):
-                # We need the same SVD+Normalizer to project q; since we didn’t persist them,
-                # fallback gracefully to TF-IDF brute-force nearest neighbors.
-                # So here: if Faiss present but we can't project q, we simply fallback below.
-                pass
         # If we have a sklearn NearestNeighbors (cosine brute-force)
         if isinstance(index_obj, NearestNeighbors):
@@ -730,17 +728,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             results = df.iloc[inds].copy()
             results["score"] = sims
         elif FAISS_OK and isinstance(index_obj, faiss.Index):
-            # We cannot re-compute SVD projection here; so we approximate by doing TF-IDF brute force
-            # to avoid mismatch. This keeps correctness at the cost of speed for queries.
-            nn = NearestNeighbors(metric="cosine", algorithm="brute")
-            nn.fit(q_vec.__class__(q_vec))  # no-op to appease types
-            # build a temporary NN on the corpus TF-IDF
-            nn = NearestNeighbors(metric="cosine", algorithm="brute")
-            # Fit once per search is heavy; instead, do manual cosine on sparse matrix:
-            # Efficient manual sparse cosine for 1 query:
-            # sim = X.dot(q_vec.T).A.ravel() / (||X|| * ||q||)
-            # But we didn’t keep X to save RAM; thus fallback to building a temp NN:
-            # Since we can't access X here, safer path: inform limited ANN for query, fallback to vectorizer NN path.
             return pd.DataFrame(), q_terms
         else:
             return pd.DataFrame(), q_terms

     rtl = bool(re.search(r"[\u0590-\u08FF]", body_h))
     dir_attr = ' dir="rtl"' if rtl else ""
+    # PRECOMPUTE to avoid backslashes inside f-string expressions
+    body_html = body_h.replace("\n", "<br/>")
     tag_html = ""
     if isinstance(tags, list) and tags:
         tag_html = " ".join([f'<span class="tag">{t}</span>' for t in tags])
     cluster_html = f'<span class="cluster-pill">{cluster_label or ""}</span>' if cluster_label else ""
+    html = (
+        f'<div class="email-card">'
+        f'  <div class="email-header">'
+        f'    <div>'
+        f'      <div class="subject">{subject_h or "(no subject)"}</div>'
+        f'      <div class="meta">From: <b>{from_email}</b> • Date: {date or "—"}</div>'
+        f'    </div>'
+        f'    <div class="badges">'
+        f'      {cluster_html}'
+        f'      <span class="sentiment">sentiment: <b>{sentiment}</b></span>'
+        f'      {tag_html}'
+        f'    </div>'
+        f'  </div>'
+        f'  <div class="email-body"{dir_attr}>'
+        f'    {body_html}'
+        f'  </div>'
+        f'</div>'
+    )
     return html
 def top_terms_per_cluster(X, labels, vectorizer, topn=6):
         # Vectorize the query
         q_vec = vec.transform([q])
         if use_lsa_flag and X_reduced is not None:
+            # Ideally, project q with the same SVD+Normalizer; since we didn't persist them,
+            # we fall back to the TF-IDF brute-force path below.
+            pass
         # If we have a sklearn NearestNeighbors (cosine brute-force)
         if isinstance(index_obj, NearestNeighbors):
             results = df.iloc[inds].copy()
             results["score"] = sims
         elif FAISS_OK and isinstance(index_obj, faiss.Index):
+            # Can't project the query into LSA here without the SVD/Normalizer objects;
+            # so skip ANN for ad-hoc queries and return no results (or implement a TF-IDF fallback if you keep X).
             return pd.DataFrame(), q_terms
         else:
             return pd.DataFrame(), q_terms