Update app.py
Browse files
app.py
CHANGED
|
@@ -1039,7 +1039,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1039 |
with gr.Row():
|
| 1040 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 1041 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 1042 |
-
|
| 1043 |
with gr.Row():
|
| 1044 |
sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
|
| 1045 |
|
|
@@ -1191,8 +1191,13 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1191 |
Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
|
| 1192 |
Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
|
| 1193 |
"""
|
|
|
|
| 1194 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 1195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1196 |
if use_hashing:
|
| 1197 |
# HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
|
| 1198 |
# We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
|
|
@@ -1212,7 +1217,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1212 |
|
| 1213 |
char_vec = CharTfidf(
|
| 1214 |
analyzer="char", ngram_range=(3, 5),
|
| 1215 |
-
min_df=
|
| 1216 |
)
|
| 1217 |
X_char = char_vec.fit_transform(texts)
|
| 1218 |
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
|
@@ -1237,7 +1242,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1237 |
|
| 1238 |
char_vec = CharTfidf(
|
| 1239 |
analyzer="char", ngram_range=(3, 5),
|
| 1240 |
-
min_df=
|
| 1241 |
)
|
| 1242 |
X_char = char_vec.fit_transform(texts)
|
| 1243 |
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
|
@@ -1533,7 +1538,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1533 |
index_obj = nn
|
| 1534 |
except Exception:
|
| 1535 |
index_obj = None
|
| 1536 |
-
|
| 1537 |
# Summaries
|
| 1538 |
cluster_counts = (
|
| 1539 |
df[df["cluster_id"] >= 0]
|
|
@@ -1624,7 +1629,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1624 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1625 |
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1626 |
lang_update = gr.update(choices=lang_choices, value="(any)")
|
| 1627 |
-
|
| 1628 |
# NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
|
| 1629 |
svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
|
| 1630 |
norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
|
|
@@ -1734,6 +1739,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1734 |
def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
|
| 1735 |
# This has to re-fit a vectorizer to project the query, since partitioning means
|
| 1736 |
# we don't have a single global one.
|
|
|
|
| 1737 |
if vec_state.get("use_hashing"):
|
| 1738 |
hv = HashingVectorizer(
|
| 1739 |
analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
|
|
@@ -1758,7 +1764,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1758 |
q_word = bm25.transform(q_word_tf)
|
| 1759 |
|
| 1760 |
char_vec = CharTfidf(
|
| 1761 |
-
analyzer="char", ngram_range=(3,5), min_df=
|
| 1762 |
lowercase=True, dtype=np.float32
|
| 1763 |
).fit(corpus_texts_for_fit)
|
| 1764 |
|
|
@@ -1769,8 +1775,8 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1769 |
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1770 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1771 |
return pd.DataFrame(), []
|
| 1772 |
-
|
| 1773 |
-
# align with df_main order (exclude -1
|
| 1774 |
mask = ~df["cluster_id"].isin([-1, -2, -3])
|
| 1775 |
filtered_df = df[mask].reset_index(drop=True)
|
| 1776 |
if filtered_df.empty:
|
|
@@ -1785,10 +1791,10 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1785 |
return pd.DataFrame(), q_terms
|
| 1786 |
else:
|
| 1787 |
q_emb = q_vec_full
|
| 1788 |
-
|
| 1789 |
n_neighbors = min(50, len(filtered_df))
|
| 1790 |
if n_neighbors <= 0: return pd.DataFrame(), q_terms
|
| 1791 |
-
|
| 1792 |
if isinstance(index_obj, NearestNeighbors):
|
| 1793 |
# Check if index was fit on placeholder
|
| 1794 |
if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
|
|
@@ -1874,7 +1880,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1874 |
return "(any)"
|
| 1875 |
label = df_sum.iloc[row_idx]["label"]
|
| 1876 |
return label if isinstance(label, str) else "(any)"
|
| 1877 |
-
|
| 1878 |
# Click domain summary to filter
|
| 1879 |
def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
|
| 1880 |
try:
|
|
@@ -1895,7 +1901,7 @@ with gr.Blocks(title="Email Investigator (Corruption Focus)", css=CSS, theme="so
|
|
| 1895 |
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1896 |
outputs=[results_df]
|
| 1897 |
)
|
| 1898 |
-
|
| 1899 |
domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
|
| 1900 |
.then(
|
| 1901 |
refresh_results,
|
|
|
|
| 1039 |
with gr.Row():
|
| 1040 |
cluster_counts_df = gr.Dataframe(label="Cluster summary (top 500) — click a row to filter", interactive=False, wrap=True)
|
| 1041 |
domain_counts_df = gr.Dataframe(label="Top sender domains", interactive=False, wrap=True)
|
| 1042 |
+
|
| 1043 |
with gr.Row():
|
| 1044 |
sender_counts_df = gr.Dataframe(label="Top senders", interactive=False, wrap=True) # NEW
|
| 1045 |
|
|
|
|
| 1191 |
Return (X_full_csr, count_vec, char_vec, bm25, d_word, d_char, d_full)
|
| 1192 |
Uses Count+BM25 (+ char-tfidf) or Hashing+TfidfTransformer (+ char-tfidf).
|
| 1193 |
"""
|
| 1194 |
+
n_docs = len(texts)
|
| 1195 |
ngram_range = (1, 2) if use_bigrams else (1, 1)
|
| 1196 |
|
| 1197 |
+
# FIX: Adapt min_df for char vectorizer when processing very small partitions (like a single doc)
|
| 1198 |
+
# With a single document, min_df=2 can never be satisfied, so sklearn's vectorizer raises ValueError at fit time; min_df=1 avoids that.
|
| 1199 |
+
char_min_df = 1 if n_docs <= 1 else 2
|
| 1200 |
+
|
| 1201 |
if use_hashing:
|
| 1202 |
# HashingVectorizer -> counts -> TfidfTransformer (as IDF) -> BM25-like not exact.
|
| 1203 |
# We’ll keep TF-IDF for words (IDF approximates) + char-grams TF-IDF (unchanged).
|
|
|
|
| 1217 |
|
| 1218 |
char_vec = CharTfidf(
|
| 1219 |
analyzer="char", ngram_range=(3, 5),
|
| 1220 |
+
min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32 # <-- USED HERE
|
| 1221 |
)
|
| 1222 |
X_char = char_vec.fit_transform(texts)
|
| 1223 |
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
|
|
|
| 1242 |
|
| 1243 |
char_vec = CharTfidf(
|
| 1244 |
analyzer="char", ngram_range=(3, 5),
|
| 1245 |
+
min_df=char_min_df, max_features=100_000, lowercase=True, dtype=np.float32 # <-- USED HERE
|
| 1246 |
)
|
| 1247 |
X_char = char_vec.fit_transform(texts)
|
| 1248 |
X_full = hstack([X_word, X_char * 0.20], format="csr")
|
|
|
|
| 1538 |
index_obj = nn
|
| 1539 |
except Exception:
|
| 1540 |
index_obj = None
|
| 1541 |
+
|
| 1542 |
# Summaries
|
| 1543 |
cluster_counts = (
|
| 1544 |
df[df["cluster_id"] >= 0]
|
|
|
|
| 1629 |
domain_update = gr.update(choices=domain_choices, value="(any)")
|
| 1630 |
sender_update = gr.update(choices=sender_choices, value="(any)")
|
| 1631 |
lang_update = gr.update(choices=lang_choices, value="(any)")
|
| 1632 |
+
|
| 1633 |
# NOTE: svd_obj and norm_obj are now local to partitions. We can only pass back the single-partition one for search.
|
| 1634 |
svd_obj_out = svd_obj_local if single_partition and 'svd_obj_local' in locals() else None
|
| 1635 |
norm_obj_out = norm_obj_local if single_partition and 'norm_obj_local' in locals() else None
|
|
|
|
| 1739 |
def _vectorize_query(q: str, vec_state: Dict[str, Any], corpus_texts_for_fit: List[str]):
|
| 1740 |
# This has to re-fit a vectorizer to project the query, since partitioning means
|
| 1741 |
# we don't have a single global one.
|
| 1742 |
+
char_min_df = 1 if len(corpus_texts_for_fit) <=1 else 2
|
| 1743 |
if vec_state.get("use_hashing"):
|
| 1744 |
hv = HashingVectorizer(
|
| 1745 |
analyzer="word", ngram_range=(1,2) if vec_state.get('use_bigrams') else (1,1),
|
|
|
|
| 1764 |
q_word = bm25.transform(q_word_tf)
|
| 1765 |
|
| 1766 |
char_vec = CharTfidf(
|
| 1767 |
+
analyzer="char", ngram_range=(3,5), min_df=char_min_df, max_features=100_000,
|
| 1768 |
lowercase=True, dtype=np.float32
|
| 1769 |
).fit(corpus_texts_for_fit)
|
| 1770 |
|
|
|
|
| 1775 |
def search_fn(q, df, vec_state, X_reduced, index_obj, use_lsa_flag, use_faiss_flag, svd_obj, norm_obj, sort_by, sort_dir):
|
| 1776 |
if (not q) or (df is None) or (vec_state is None) or (index_obj is None):
|
| 1777 |
return pd.DataFrame(), []
|
| 1778 |
+
|
| 1779 |
+
# Align with df_main order: drop rows whose cluster_id is -1, -2 or -3.
|
| 1780 |
mask = ~df["cluster_id"].isin([-1, -2, -3])
|
| 1781 |
filtered_df = df[mask].reset_index(drop=True)
|
| 1782 |
if filtered_df.empty:
|
|
|
|
| 1791 |
return pd.DataFrame(), q_terms
|
| 1792 |
else:
|
| 1793 |
q_emb = q_vec_full
|
| 1794 |
+
|
| 1795 |
n_neighbors = min(50, len(filtered_df))
|
| 1796 |
if n_neighbors <= 0: return pd.DataFrame(), q_terms
|
| 1797 |
+
|
| 1798 |
if isinstance(index_obj, NearestNeighbors):
|
| 1799 |
# Check if index was fit on placeholder
|
| 1800 |
if hasattr(index_obj, 'n_samples_fit_') and index_obj.n_samples_fit_ <= 1:
|
|
|
|
| 1880 |
return "(any)"
|
| 1881 |
label = df_sum.iloc[row_idx]["label"]
|
| 1882 |
return label if isinstance(label, str) else "(any)"
|
| 1883 |
+
|
| 1884 |
# Click domain summary to filter
|
| 1885 |
def on_domain_click(evt: gr.SelectData, df_sum: pd.DataFrame):
|
| 1886 |
try:
|
|
|
|
| 1901 |
inputs=[state_df, cluster_drop, domain_drop, sender_drop, lang_drop, sentiment_drop, tag_drop, date_start, date_end, sort_by, sort_dir],
|
| 1902 |
outputs=[results_df]
|
| 1903 |
)
|
| 1904 |
+
|
| 1905 |
domain_counts_df.select(on_domain_click, inputs=[domain_counts_df], outputs=[domain_drop]) \
|
| 1906 |
.then(
|
| 1907 |
refresh_results,
|