wuhp committed on
Commit
f97dfc3
·
verified ·
1 Parent(s): 042f4b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -44
app.py CHANGED
@@ -103,6 +103,17 @@ OFFCHANNEL_RE = re.compile("|".join(OFFCHANNEL_PATTERNS), re.I)
103
  # Common personal mail domains (used with user-specified trusted org domains)
104
  PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
105
 
 
 
 
 
 
 
 
 
 
 
 
106
  # Optional seeded themes for semi-supervised init (used only when LSA is ON)
107
  CORR_LEX = {
108
  "kickback" : ["kickback","bribe","under the table","gift","cash"],
@@ -111,7 +122,7 @@ CORR_LEX = {
111
  "money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
112
  }
113
 
114
- # =================== Label cleanup helpers (unchanged core) ===================
115
  EN_STOP = {
116
  "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
117
  "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
@@ -522,37 +533,79 @@ def enrich_text(row: pd.Series) -> str:
522
  if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
523
  if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
524
  if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
 
 
525
  return (t + " " + " ".join(tokens)).strip()
526
 
527
- # =================== Cluster labeling: PMI bigrams ===================
528
  def cluster_labels_pmi_bigram(texts, labels, topn=6):
529
- def bigrams(t):
530
- toks = re.findall(TOKEN_PATTERN, t.lower())
531
- return [" ".join(p) for p in zip(toks, toks[1:])]
532
- N = len(texts)
533
- from collections import Counter
534
  import math as _math
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  glob_bg = Counter()
536
- per_c = {int(c): Counter() for c in np.unique(labels)}
537
- for t, c in zip(texts, labels):
538
- bgs = set(bigrams(t))
 
 
 
539
  glob_bg.update(bgs)
540
- per_c[int(c)].update(bgs)
 
 
541
  labels_out = {}
542
- total_bg = sum(glob_bg.values()) + 1e-9
543
- for c in np.unique(labels):
544
- c = int(c)
 
545
  scores = []
546
- total_c = sum(per_c[c].values()) + 1e-9
547
- for bg, cnt in per_c[c].most_common(1000):
548
  p_bg_c = cnt / total_c
549
  p_bg = (glob_bg[bg] / total_bg)
550
  if p_bg > 0 and p_bg_c > 0:
551
  score = _math.log(p_bg_c) - _math.log(p_bg)
552
  scores.append((score, bg))
553
  scores.sort(reverse=True)
554
- top = [bg for _, bg in scores[:topn]]
555
- labels_out[c] = ", ".join(top) if top else f"cluster_{c}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  return labels_out
557
 
558
  # =================== Auto-k & merge ===================
@@ -707,13 +760,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
707
  with gr.Accordion("Vectorization & Clustering", open=True):
708
  with gr.Row():
709
  max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
710
- min_df = gr.Number(label="min_df (doc freq ≥)", value=2, precision=0)
711
  max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
712
  use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
713
  skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
714
  with gr.Row():
715
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
716
- lsa_dim = gr.Number(label="LSA components", value=150, precision=0)
717
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
718
  k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
719
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
@@ -878,7 +931,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
878
  # trusted org domains
879
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
880
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
881
- # extend SUSPECT_PHRASES runtime (no mutation of constant list)
882
  extra_terms_lower = [t.lower() for t in extra_terms]
883
 
884
  recs = _load_json_records(inbox_file.name)
@@ -921,8 +974,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
921
  flags.append(f)
922
  df["flags"] = flags
923
 
924
- # Enriched texts (adds __HAS_*__ flags)
925
- texts = list(df.apply(enrich_text, axis=1))
 
 
 
 
 
926
 
927
  # === Vectorization ===
928
  ngram_range = (1, 2) if use_bigrams else (1, 1)
@@ -946,7 +1004,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
946
  )
947
  X_char = char_vec.fit_transform(texts)
948
 
949
- X_full = hstack([X_word, X_char], format="csr")
 
950
  d_word = X_word.shape[1]
951
  d_char = X_char.shape[1]
952
  d_full = X_full.shape[1]
@@ -965,16 +1024,14 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
965
  gc.collect()
966
 
967
  # Optional anomaly detection (on LSA space)
968
- anomaly_scores = np.full((len(df),), np.nan, dtype=np.float32)
969
  if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
970
  try:
971
  iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
972
  iso.fit(X_reduced)
973
- # higher is less anomalous; convert to anomaly score = -score
974
- anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32)
975
  except Exception:
976
  pass
977
- df["anomaly_score"] = anomaly_scores
978
 
979
  # K selection
980
  if bool(auto_k):
@@ -1006,20 +1063,29 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1006
  )
1007
  labels = kmeans.fit_predict(X_space)
1008
 
1009
- # Merge very-similar clusters (LSA only)
1010
  if use_lsa:
1011
- labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.92)
1012
 
1013
- df["cluster_id"] = labels
1014
-
1015
- # Cluster names
1016
  term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
1017
- df["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
 
 
 
 
 
 
 
1018
 
1019
- # CorruptionScore (now uses trusted domains)
 
 
 
1020
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1021
 
1022
- # Build search index
1023
  use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
1024
  index_obj = None
1025
  if use_faiss:
@@ -1033,7 +1099,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1033
 
1034
  # Summaries
1035
  cluster_counts = (
1036
- df.groupby(["cluster_id", "cluster_name"]).size()
 
1037
  .reset_index(name="count")
1038
  .sort_values("count", ascending=False)
1039
  .head(500)
@@ -1041,6 +1108,15 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1041
  cluster_counts["label"] = cluster_counts.apply(
1042
  lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
1043
  )
 
 
 
 
 
 
 
 
 
1044
  cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
1045
 
1046
  domain_counts = (
@@ -1084,7 +1160,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1084
 
1085
  status_md = (
1086
  f"**Processed {len(df):,} emails** \n"
1087
- f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} | Total: {d_full:,} \n"
1088
  f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
1089
  f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1090
  f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
@@ -1143,7 +1219,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1143
  elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
1144
  tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
1145
  else:
1146
- # corruption_score or search_score (if present)
1147
  col = by if by in tmp.columns else "corruption_score"
1148
  tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
1149
  tmp = tmp.drop(columns=["_dt"])
@@ -1216,16 +1291,16 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1216
  q_emb = q_vec_full
1217
 
1218
  if isinstance(index_obj, NearestNeighbors):
1219
- distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df)))
1220
  inds = indices[0]
1221
  sims = 1.0 - distances[0]
1222
- results = df.iloc[inds].copy()
1223
  results["search_score"] = sims
1224
  elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
1225
- D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df)))
1226
  inds = I[0]
1227
  sims = D[0]
1228
- results = df.iloc[inds].copy()
1229
  results["search_score"] = sims
1230
  else:
1231
  return pd.DataFrame(), q_terms
@@ -1238,7 +1313,6 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1238
  if sort_by == "search_score":
1239
  results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
1240
  else:
1241
- # use blended but keep sort_by if chosen
1242
  if sort_by in results.columns:
1243
  results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
1244
  else:
 
103
  # Common personal mail domains (used with user-specified trusted org domains)
104
  PERSONAL_DOMAINS = {"gmail.com","yahoo.com","outlook.com","hotmail.com","proton.me","protonmail.com","icloud.com","mail.ru","yandex.ru"}
105
 
106
+ # Newsletter/newswire heuristics
107
+ NEWS_DOMAINS = {"nytimes.com","ft.com","wsj.com","bloomberg.com","reuters.com","theguardian.com","economist.com"}
108
+ def is_news_like(subject: str, body: str, from_domain: str) -> bool:
109
+ s = (subject or "").lower()
110
+ b = (body or "").lower()
111
+ fd = (from_domain or "").lower()
112
+ if "unsubscribe" in b or "manage preferences" in b: return True
113
+ if any(k in s for k in ["daily briefing","morning update","newsletter","top stories"]): return True
114
+ if any(d in fd for d in NEWS_DOMAINS): return True
115
+ return False
116
+
117
  # Optional seeded themes for semi-supervised init (used only when LSA is ON)
118
  CORR_LEX = {
119
  "kickback" : ["kickback","bribe","under the table","gift","cash"],
 
122
  "money_flow" : ["wire transfer","transfer","swift","iban","routing number","account number","cash"]
123
  }
124
 
125
+ # =================== Label cleanup helpers ===================
126
  EN_STOP = {
127
  "the","of","and","to","in","is","for","on","at","with","from","by","or","as",
128
  "that","this","it","be","are","was","were","an","a","you","your","we","our","us",
 
533
  if INVOICE_RE.search(t): tokens.append("__HAS_INVOICE__")
534
  if COMPANY_RE.search(t): tokens.append("__HAS_COMPANY__")
535
  if OFFCHANNEL_RE.search(t): tokens.append("__OFF_CHANNEL__")
536
+ lang_tok = f'__LANG_{(row.get("lang") or "unk").lower()}__'
537
+ tokens.append(lang_tok)
538
  return (t + " " + " ".join(tokens)).strip()
539
 
540
+ # =================== Cluster labeling: improved PMI + class-TFIDF ===================
541
  def cluster_labels_pmi_bigram(texts, labels, topn=6):
 
 
 
 
 
542
  import math as _math
543
+ from collections import Counter, defaultdict
544
+ from sklearn.feature_extraction.text import TfidfVectorizer
545
+
546
+ HEADER_STOP = {"subject","re","fw","fwd","to","cc","bcc","from","sent","forwarded","回复","主题","收件人","发件人"}
547
+
548
+ def is_junk_token(tok: str) -> bool:
549
+ if _is_junk_term(tok): return True
550
+ tl = tok.lower()
551
+ if "@" in tl: return True
552
+ if tl in HEADER_STOP: return True
553
+ if re.search(r"[^\w\-']", tl): # punctuation blobs
554
+ if "’" not in tl and "'" not in tl:
555
+ return True
556
+ return False
557
+
558
+ def tokenize_clean(t):
559
+ toks = re.findall(TOKEN_PATTERN, t.lower())
560
+ return [w for w in toks if not is_junk_token(w)]
561
+
562
+ def bigrams(toks):
563
+ return [" ".join(p) for p in zip(toks, toks[1:]) if all(not is_junk_token(x) for x in p)]
564
+
565
  glob_bg = Counter()
566
+ per_c_bg = defaultdict(Counter)
567
+ per_c_texts = defaultdict(list)
568
+
569
+ for txt, c in zip(texts, labels):
570
+ toks = tokenize_clean(txt)
571
+ bgs = set(bigrams(toks))
572
  glob_bg.update(bgs)
573
+ per_c_bg[int(c)].update(bgs)
574
+ per_c_texts[int(c)].append(" ".join(toks))
575
+
576
  labels_out = {}
577
+ total_bg = sum(glob_bg.values()) + 1e-12
578
+
579
+ for c in sorted(set(int(x) for x in labels)):
580
+ # PMI bigrams
581
  scores = []
582
+ total_c = sum(per_c_bg[c].values()) + 1e-12
583
+ for bg, cnt in per_c_bg[c].most_common(2000):
584
  p_bg_c = cnt / total_c
585
  p_bg = (glob_bg[bg] / total_bg)
586
  if p_bg > 0 and p_bg_c > 0:
587
  score = _math.log(p_bg_c) - _math.log(p_bg)
588
  scores.append((score, bg))
589
  scores.sort(reverse=True)
590
+ top_bi = [bg for _, bg in scores[: max(2, topn//2) ]]
591
+
592
+ # class-TFIDF unigrams (cluster doc vs. background doc)
593
+ docs_c = [" ".join(per_c_texts[c])] if per_c_texts[c] else [" "]
594
+ docs_bg = [" ".join(sum((per_c_texts[k] for k in per_c_texts if k!=c), [])) or " "]
595
+ corpus = [docs_c[0], docs_bg[0]]
596
+ vec = TfidfVectorizer(
597
+ analyzer="word", ngram_range=(1,1),
598
+ max_features=3000, token_pattern=TOKEN_PATTERN, lowercase=True
599
+ )
600
+ X = vec.fit_transform(corpus)
601
+ vocab = np.array(sorted(vec.vocabulary_, key=lambda k: vec.vocabulary_[k]))
602
+ row = X[0].toarray().ravel()
603
+ top_idx = row.argsort()[::-1][: max(0, topn - len(top_bi)) ]
604
+ top_uni = [t for t in vocab[top_idx] if not is_junk_token(t)][: max(0, topn - len(top_bi)) ]
605
+
606
+ parts = top_bi + top_uni
607
+ labels_out[c] = ", ".join(parts) if parts else f"cluster_{c}"
608
+
609
  return labels_out
610
 
611
  # =================== Auto-k & merge ===================
 
760
  with gr.Accordion("Vectorization & Clustering", open=True):
761
  with gr.Row():
762
  max_features = gr.Number(label="Word max_features (BM25)", value=120_000, precision=0)
763
+ min_df = gr.Number(label="min_df (doc freq ≥)", value=3, precision=0) # tightened default
764
  max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.7, step=0.05)
765
  use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
766
  skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
767
  with gr.Row():
768
  use_lsa = gr.Checkbox(label="Use LSA (TruncatedSVD) before KMeans", value=True)
769
+ lsa_dim = gr.Number(label="LSA components", value=256, precision=0) # richer default
770
  auto_k = gr.Checkbox(label="Auto choose k (kneedle)", value=True)
771
  k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=350, precision=0)
772
  mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
 
931
  # trusted org domains
932
  trusted = set([d.strip().lower() for d in (trusted_domains_in or "").split(",") if d.strip()])
933
  extra_terms = [t.strip() for t in (extra_keywords_in or "").split(",") if t.strip()]
934
+ # extend phrases at runtime
935
  extra_terms_lower = [t.lower() for t in extra_terms]
936
 
937
  recs = _load_json_records(inbox_file.name)
 
974
  flags.append(f)
975
  df["flags"] = flags
976
 
977
+ # Identify news-like messages and separate them out before clustering
978
+ df["is_news"] = df.apply(lambda r: is_news_like(r.get("subject",""), r.get("body_text",""), r.get("from_domain","")), axis=1)
979
+ df_main = df[~df["is_news"]].reset_index(drop=True)
980
+ df_news = df[df["is_news"]].reset_index(drop=True)
981
+
982
+ # Enriched texts (adds __HAS_*__ flags + __LANG__)
983
+ texts = list(df_main.apply(enrich_text, axis=1))
984
 
985
  # === Vectorization ===
986
  ngram_range = (1, 2) if use_bigrams else (1, 1)
 
1004
  )
1005
  X_char = char_vec.fit_transform(texts)
1006
 
1007
+ # Down-weight char-grams so they don't dominate geometry
1008
+ X_full = hstack([X_word, X_char * 0.4], format="csr")
1009
  d_word = X_word.shape[1]
1010
  d_char = X_char.shape[1]
1011
  d_full = X_full.shape[1]
 
1024
  gc.collect()
1025
 
1026
  # Optional anomaly detection (on LSA space)
1027
+ anomaly_scores = np.full((len(df_main),), np.nan, dtype=np.float32)
1028
  if use_lsa and bool(use_iso) and ISO_OK and X_reduced is not None and X_reduced.shape[0] >= 50:
1029
  try:
1030
  iso = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
1031
  iso.fit(X_reduced)
1032
+ anomaly_scores = (-iso.score_samples(X_reduced)).astype(np.float32) # higher = more anomalous
 
1033
  except Exception:
1034
  pass
 
1035
 
1036
  # K selection
1037
  if bool(auto_k):
 
1063
  )
1064
  labels = kmeans.fit_predict(X_space)
1065
 
1066
+ # Merge very-similar clusters (LSA only) — slightly stricter
1067
  if use_lsa:
1068
+ labels = merge_close_clusters(labels, kmeans.cluster_centers_, thresh=0.94)
1069
 
1070
+ # Attach clustering back to df_main
1071
+ df_main["cluster_id"] = labels
 
1072
  term_names = cluster_labels_pmi_bigram(texts, labels, topn=6)
1073
+ df_main["cluster_name"] = [term_names.get(int(c), f"cluster_{int(c)}") for c in labels]
1074
+ df_main["anomaly_score"] = anomaly_scores
1075
+
1076
+ # Newsletter/newswire rows: assign a special cluster
1077
+ if len(df_news):
1078
+ df_news["cluster_id"] = -1
1079
+ df_news["cluster_name"] = "newsletter/news"
1080
+ df_news["anomaly_score"] = np.nan
1081
 
1082
+ # Combine back
1083
+ df = pd.concat([df_main, df_news], ignore_index=True)
1084
+
1085
+ # CorruptionScore (uses trusted domains)
1086
  df["corruption_score"] = df.apply(lambda r: corruption_score(r, trusted_domains=trusted), axis=1)
1087
 
1088
+ # Build search index on clustered subset only
1089
  use_faiss = bool(use_faiss) and FAISS_OK and use_lsa and (X_reduced is not None)
1090
  index_obj = None
1091
  if use_faiss:
 
1099
 
1100
  # Summaries
1101
  cluster_counts = (
1102
+ df[df["cluster_id"] != -1]
1103
+ .groupby(["cluster_id", "cluster_name"]).size()
1104
  .reset_index(name="count")
1105
  .sort_values("count", ascending=False)
1106
  .head(500)
 
1108
  cluster_counts["label"] = cluster_counts.apply(
1109
  lambda r: f'{int(r["cluster_id"])} — {r["cluster_name"]} ({int(r["count"])})', axis=1
1110
  )
1111
+ # Optionally append newsletter bucket
1112
+ news_count = int((df["cluster_id"] == -1).sum())
1113
+ if news_count > 0:
1114
+ cluster_counts = pd.concat([
1115
+ cluster_counts,
1116
+ pd.DataFrame([{"cluster_id": -1, "cluster_name": "newsletter/news", "count": news_count,
1117
+ "label": f'-1 — newsletter/news ({news_count})'}])
1118
+ ], ignore_index=True)
1119
+
1120
  cluster_choices = ["(any)"] + cluster_counts["label"].tolist()
1121
 
1122
  domain_counts = (
 
1160
 
1161
  status_md = (
1162
  f"**Processed {len(df):,} emails** \n"
1163
+ f"Word feats (BM25): {d_word:,} | Char feats: {d_char:,} (x0.4) | Total: {d_full:,} \n"
1164
  f"{'LSA: ' + str(X_reduced.shape[1]) + ' dims | ' if use_lsa else ''}"
1165
  f"k = {k} | Search = {'Faiss (IP on LSA)' if use_faiss else 'cosine brute-force'} | "
1166
  f"Anomaly = {'ON' if (use_lsa and use_iso and ISO_OK) else 'OFF'}"
 
1219
  elif by == "anomaly_score" and "anomaly_score" in tmp.columns:
1220
  tmp = tmp.sort_values(["anomaly_score","_dt"], ascending=[asc, not asc])
1221
  else:
 
1222
  col = by if by in tmp.columns else "corruption_score"
1223
  tmp = tmp.sort_values([col,"_dt"], ascending=[asc, not asc])
1224
  tmp = tmp.drop(columns=["_dt"])
 
1291
  q_emb = q_vec_full
1292
 
1293
  if isinstance(index_obj, NearestNeighbors):
1294
+ distances, indices = index_obj.kneighbors(q_emb, n_neighbors=min(50, len(df[df["cluster_id"]!=-1])))
1295
  inds = indices[0]
1296
  sims = 1.0 - distances[0]
1297
+ results = df[df["cluster_id"]!=-1].iloc[inds].copy()
1298
  results["search_score"] = sims
1299
  elif FAISS_OK and use_faiss_flag and isinstance(index_obj, faiss.Index):
1300
+ D, I = index_obj.search(q_emb.astype(np.float32), min(50, len(df[df["cluster_id"]!=-1])))
1301
  inds = I[0]
1302
  sims = D[0]
1303
+ results = df[df["cluster_id"]!=-1].iloc[inds].copy()
1304
  results["search_score"] = sims
1305
  else:
1306
  return pd.DataFrame(), q_terms
 
1313
  if sort_by == "search_score":
1314
  results = results.sort_values("search_score", ascending=(sort_dir=="asc"))
1315
  else:
 
1316
  if sort_by in results.columns:
1317
  results = results.sort_values([sort_by,"_blend"], ascending=[(sort_dir=="asc"), False])
1318
  else: