wuhp committed on
Commit
30c3c12
·
verified ·
1 Parent(s): d654de9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -12
app.py CHANGED
@@ -1039,7 +1039,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1039
  with gr.Row():
1040
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
1041
  domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
1042
-
1043
  with gr.Row():
1044
  sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
1045
 
@@ -1191,8 +1191,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1191
  Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
1192
  Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
1193
  """
 
1194
  ngram_range = (1, 2) if use_bigrams else (1, 1)
1195
 
 
 
 
 
1196
  if use_hashing:
1197
  # HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
1198
  # We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
@@ -1212,7 +1217,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1212
 
1213
  char_vec = CharTfidf(
1214
  analyzer="char", ngram_range=(3, 5),
1215
- min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
1216
  )
1217
  X_char = char_vec.fit_transform(texts)
1218
  X_full = hstack([X_word, X_char * 0.20], format="csr")
@@ -1237,7 +1242,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1237
 
1238
  char_vec = CharTfidf(
1239
  analyzer="char", ngram_range=(3, 5),
1240
- min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
1241
  )
1242
  X_char = char_vec.fit_transform(texts)
1243
  X_full = hstack([X_word, X_char * 0.20], format="csr")
@@ -1533,7 +1538,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1533
  index_obj = nn
1534
  except Exception:
1535
  index_obj = None
1536
-
1537
  # Summaries
1538
  cluster_counts = (
1539
  df[df["cluster_id"] >= 0]
@@ -1624,7 +1629,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1624
  domain_update = gr.update(choices=domain_choices, value="(any)")
1625
  sender_update = gr.update(choices=sender_choices, value="(any)")
1626
  lang_update = gr.update(choices=lang_choices, value="(any)")
1627
-
1628
  # NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
1629
  svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
1630
  norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
@@ -1734,6 +1739,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1734
  def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
1735
  # This has to re-fit a vectorizer to project the query, since partitioning means
1736
  # we don't have a single global one.
 
1737
  if vec_state.get("use_hashing"):
1738
  hv = HashingVectorizer(
1739
  analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
@@ -1758,7 +1764,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1758
  q_word = bm25.transform(q_word_tf)
1759
 
1760
  char_vec = CharTfidf(
1761
- analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
1762
  lowercase=True, dtype=np.float32
1763
  ).fit(corpus_texts_for_fit)
1764
 
@@ -1769,8 +1775,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1769
  def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
1770
  if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
1771
  return pd.DataFrame(), []
1772
-
1773
- # align with df_main order (exclude -1 and -2)
1774
  mask = ~df["cluster_id"].isin([-1, -2, -3])
1775
  filtered_df = df[mask].reset_index(drop=True)
1776
  if filtered_df.empty:
@@ -1785,10 +1791,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1785
  return pd.DataFrame(), q_terms
1786
  else:
1787
  q_emb = q_vec_full
1788
-
1789
  n_neighbors = min(50, len(filtered_df))
1790
  if n_neighbors <= 0: return pd.DataFrame(), q_terms
1791
-
1792
  if isinstance(index_obj, NearestNeighbors):
1793
  # Check if index was fit on placeholder
1794
  if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
@@ -1874,7 +1880,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1874
  return "(any)"
1875
  label = df_sum.iloc[row_idx]["label"]
1876
  return label if isinstance(label, str) else "(any)"
1877
-
1878
  # Click domain summary to filter
1879
  def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
1880
  try:
@@ -1895,7 +1901,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
1895
  inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
1896
  outputs=[results_df]
1897
  )
1898
-
1899
  domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
1900
  .then(
1901
  refresh_results,
 
1039
  with gr.Row():
1040
  cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
1041
  domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
1042
+
1043
  with gr.Row():
1044
  sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
1045
 
 
1191
  Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
1192
  Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
1193
  """
1194
+ n_docs = len(texts)
1195
  ngram_range = (1, 2) if use_bigrams else (1, 1)
1196
 
1197
+ # FIX: Adapt min_df for char vectorizer when processing very small partitions (like a single doc)
1198
+ # This prevents the ValueError from sklearn.
1199
+ char_min_df = 1 if n_docs <= 1 else 2
1200
+
1201
  if use_hashing:
1202
  # HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
1203
  # We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
 
1217
 
1218
  char_vec = CharTfidf(
1219
  analyzer="char", ngram_range=(3, 5),
1220
+ min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32 # <-- USED HERE
1221
  )
1222
  X_char = char_vec.fit_transform(texts)
1223
  X_full = hstack([X_word, X_char * 0.20], format="csr")
 
1242
 
1243
  char_vec = CharTfidf(
1244
  analyzer="char", ngram_range=(3, 5),
1245
+ min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32 # <-- USED HERE
1246
  )
1247
  X_char = char_vec.fit_transform(texts)
1248
  X_full = hstack([X_word, X_char * 0.20], format="csr")
 
1538
  index_obj = nn
1539
  except Exception:
1540
  index_obj = None
1541
+
1542
  # Summaries
1543
  cluster_counts = (
1544
  df[df["cluster_id"] >= 0]
 
1629
  domain_update = gr.update(choices=domain_choices, value="(any)")
1630
  sender_update = gr.update(choices=sender_choices, value="(any)")
1631
  lang_update = gr.update(choices=lang_choices, value="(any)")
1632
+
1633
  # NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
1634
  svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
1635
  norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
 
1739
  def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
1740
  # This has to re-fit a vectorizer to project the query, since partitioning means
1741
  # we don't have a single global one.
1742
+ char_min_df = 1 if len(corpus_texts_for_fit) <=1 else 2
1743
  if vec_state.get("use_hashing"):
1744
  hv = HashingVectorizer(
1745
  analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
 
1764
  q_word = bm25.transform(q_word_tf)
1765
 
1766
  char_vec = CharTfidf(
1767
+ analyzer="char", ngram_range=(3,5), min_df=char_min_df, max_features=100_000,
1768
  lowercase=True, dtype=np.float32
1769
  ).fit(corpus_texts_for_fit)
1770
 
 
1775
  def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
1776
  if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
1777
  return pd.DataFrame(), []
1778
+
1779
+ # align with df_main order (exclude -1, -2, -3)
1780
  mask = ~df["cluster_id"].isin([-1, -2, -3])
1781
  filtered_df = df[mask].reset_index(drop=True)
1782
  if filtered_df.empty:
 
1791
  return pd.DataFrame(), q_terms
1792
  else:
1793
  q_emb = q_vec_full
1794
+
1795
  n_neighbors = min(50, len(filtered_df))
1796
  if n_neighbors <= 0: return pd.DataFrame(), q_terms
1797
+
1798
  if isinstance(index_obj, NearestNeighbors):
1799
  # Check if index was fit on placeholder
1800
  if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
 
1880
  return "(any)"
1881
  label = df_sum.iloc[row_idx]["label"]
1882
  return label if isinstance(label, str) else "(any)"
1883
+
1884
  # Click domain summary to filter
1885
  def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
1886
  try:
 
1901
  inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
1902
  outputs=[results_df]
1903
  )
1904
+
1905
  domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
1906
  .then(
1907
  refresh_results,