Spaces:

wuhp
/

testmail2

Sleeping

App Files Files Community

wuhp committed on Sep 1, 2025

Commit

30c3c12

verified ·

1 Parent(s): d654de9

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -12

app.py CHANGED Viewed

@@ -1039,7 +1039,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     with gr.Row():
         cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
         domain_counts_df  = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
     with gr.Row():
         sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True)  # NEW
@@ -1191,8 +1191,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
             Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
             """
             ngram_range = (1, 2) if use_bigrams else (1, 1)
             if use_hashing:
                 # HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
                 # We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
@@ -1212,7 +1217,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 char_vec = CharTfidf(
                     analyzer="char", ngram_range=(3, 5),
-                    min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
                 )
                 X_char = char_vec.fit_transform(texts)
                 X_full = hstack([X_word, X_char * 0.20], format="csr")
@@ -1237,7 +1242,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             char_vec = CharTfidf(
                 analyzer="char", ngram_range=(3, 5),
-                min_df=2, max_features=100_000, lowercase=True, dtype=np.float32
             )
             X_char = char_vec.fit_transform(texts)
             X_full = hstack([X_word, X_char * 0.20], format="csr")
@@ -1533,7 +1538,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                     index_obj = nn
             except Exception:
                 index_obj = None
         # Summaries
         cluster_counts = (
             df[df["cluster_id"] >= 0]
@@ -1624,7 +1629,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         domain_update  = gr.update(choices=domain_choices,  value="(any)")
         sender_update  = gr.update(choices=sender_choices,  value="(any)")
         lang_update    = gr.update(choices=lang_choices,    value="(any)")
         # NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
         svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
         norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
@@ -1734,6 +1739,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
         # This has to re-fit a vectorizer to project the query, since partitioning means
         # we don't have a single global one.
         if vec_state.get("use_hashing"):
             hv = HashingVectorizer(
                 analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
@@ -1758,7 +1764,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             q_word = bm25.transform(q_word_tf)
         char_vec = CharTfidf(
-            analyzer="char", ngram_range=(3,5), min_df=2, max_features=100_000,
             lowercase=True, dtype=np.float32
         ).fit(corpus_texts_for_fit)
@@ -1769,8 +1775,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
     def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
         if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
             return pd.DataFrame(), []
-        # align with df_main order (exclude -1 and -2)
         mask = ~df["cluster_id"].isin([-1, -2, -3])
         filtered_df = df[mask].reset_index(drop=True)
         if filtered_df.empty:
@@ -1785,10 +1791,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
                 return pd.DataFrame(), q_terms
         else:
             q_emb = q_vec_full
         n_neighbors = min(50, len(filtered_df))
         if n_neighbors <= 0: return pd.DataFrame(), q_terms
         if isinstance(index_obj, NearestNeighbors):
             # Check if index was fit on placeholder
             if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
@@ -1874,7 +1880,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
             return "(any)"
         label = df_sum.iloc[row_idx]["label"]
         return label if isinstance(label, str) else "(any)"
     # Click domain summary to filter
     def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
         try:
@@ -1895,7 +1901,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
         inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
         outputs=[results_df]
     )
     domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
     .then(
         refresh_results,

     with gr.Row():
         cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
         domain_counts_df  = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
     with gr.Row():
         sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True)  # NEW
             Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
             Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
             """
+            n_docs = len(texts)
             ngram_range = (1, 2) if use_bigrams else (1, 1)
+            # FIX: Use min_df=1 for the char vectorizer when the partition holds at most one document.
+            # With min_df=2 on a single document, sklearn raises a ValueError during fit_transform.
+            char_min_df = 1 if n_docs <= 1 else 2
             if use_hashing:
                 # HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
                 # We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
                 char_vec = CharTfidf(
                     analyzer="char", ngram_range=(3, 5),
+                    min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32 # <-- USED HERE
                 )
                 X_char = char_vec.fit_transform(texts)
                 X_full = hstack([X_word, X_char * 0.20], format="csr")
             char_vec = CharTfidf(
                 analyzer="char", ngram_range=(3, 5),
+                min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32 # <-- USED HERE
             )
             X_char = char_vec.fit_transform(texts)
             X_full = hstack([X_word, X_char * 0.20], format="csr")
                     index_obj = nn
             except Exception:
                 index_obj = None
         # Summaries
         cluster_counts = (
             df[df["cluster_id"] >= 0]
         domain_update  = gr.update(choices=domain_choices,  value="(any)")
         sender_update  = gr.update(choices=sender_choices,  value="(any)")
         lang_update    = gr.update(choices=lang_choices,    value="(any)")
         # NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
         svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
         norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
     def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
         # This has to re-fit a vectorizer to project the query, since partitioning means
         # we don't have a single global one.
+        char_min_df = 1 if len(corpus_texts_for_fit) <=1 else 2
         if vec_state.get("use_hashing"):
             hv = HashingVectorizer(
                 analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
             q_word = bm25.transform(q_word_tf)
         char_vec = CharTfidf(
+            analyzer="char", ngram_range=(3,5), min_df=char_min_df, max_features=100_000,
             lowercase=True, dtype=np.float32
         ).fit(corpus_texts_for_fit)
     def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
         if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
             return pd.DataFrame(), []
+        # align with df_main order (exclude -1, -2, -3)
         mask = ~df["cluster_id"].isin([-1, -2, -3])
         filtered_df = df[mask].reset_index(drop=True)
         if filtered_df.empty:
                 return pd.DataFrame(), q_terms
         else:
             q_emb = q_vec_full
         n_neighbors = min(50, len(filtered_df))
         if n_neighbors <= 0: return pd.DataFrame(), q_terms
         if isinstance(index_obj, NearestNeighbors):
             # Check if index was fit on placeholder
             if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
             return "(any)"
         label = df_sum.iloc[row_idx]["label"]
         return label if isinstance(label, str) else "(any)"
     # Click domain summary to filter
     def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
         try:
         inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
         outputs=[results_df]
     )
     domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
     .then(
         refresh_results,