wuhp committed on
Commit
9ef7e16
·
verified ·
1 Parent(s): b4b271e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -171
app.py CHANGED
@@ -15,27 +15,14 @@ DetectorFactory.seed = 0
15
  import gradio as gr
16
  from tqdm import tqdm
17
 
18
- from sentence_transformers import SentenceTransformer
19
- import faiss
20
-
21
- try:
22
- import hdbscan
23
- HDBSCAN_AVAILABLE = True
24
- except Exception:
25
- HDBSCAN_AVAILABLE = False
26
 
27
  # ------------------- Helpers -------------------
28
  URL = re.compile(r"https?://\S+", re.I)
29
- SKIP_LANGDETECT = True # CPU-friendly default; can be toggled in the UI
30
-
31
-
32
def torch_cuda_available():
    """Best-effort CUDA probe: True iff PyTorch imports and reports a usable GPU.

    Never raises — any failure (torch missing, broken CUDA runtime) yields False.
    """
    try:
        import torch
        return torch.cuda.is_available()
    except Exception:
        # torch not installed or CUDA query failed: treat as CPU-only.
        return False
38
-
39
 
40
  def html_to_text(html: str) -> str:
41
  if not html:
@@ -45,7 +32,6 @@ def html_to_text(html: str) -> str:
45
  tag.decompose()
46
  return soup.get_text(separator="\n")
47
 
48
-
49
  def strip_quotes_and_sigs(text: str) -> str:
50
  if not text:
51
  return ""
@@ -54,7 +40,6 @@ def strip_quotes_and_sigs(text: str) -> str:
54
  res = re.split(r"\nSent from my ", res)[0]
55
  return res.strip()
56
 
57
-
58
  def parse_name_email(s: str) -> Tuple[str, str]:
59
  if not s:
60
  return "", ""
@@ -64,7 +49,6 @@ def parse_name_email(s: str) -> Tuple[str, str]:
64
  return "", s.strip()
65
 
66
  # ------------------- Normalization -------------------
67
-
68
  def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
69
  subject = raw.get("subject") or raw.get("Subject") or ""
70
  body_html = raw.get("body_html") or raw.get("html") or ""
@@ -128,134 +112,46 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
128
  "text_hash": text_hash,
129
  }
130
 
131
# ------------------- Embeddings & Clustering -------------------

def embed_texts(
    model: SentenceTransformer,
    texts: List[str],
    batch_size: int,
    use_gpu: bool,
    use_multiprocess: bool = True
) -> np.ndarray:
    """
    Faster CPU path: try multi-process first; fall back to single-process batching.
    """
    # Multi-process encoding only makes sense on CPU with at least two cores.
    if not use_gpu and use_multiprocess and (os.cpu_count() or 1) >= 2:
        try:
            pool = model.start_multi_process_pool()
            arr = model.encode_multi_process(texts, pool, normalize_embeddings=True)
            model.stop_multi_process_pool(pool)
            return np.asarray(arr, dtype=np.float32)
        except Exception:
            pass  # fallback below

    device = "cuda" if use_gpu else "cpu"
    chunks = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding", leave=False):
        batch = texts[start:start + batch_size]
        chunks.append(model.encode(
            batch,
            batch_size=min(batch_size, len(batch)),
            show_progress_bar=False,
            normalize_embeddings=True,
            convert_to_numpy=True,
            device=device,
        ))
    return np.vstack(chunks).astype(np.float32)
164
-
165
-
166
def cluster_embeddings(embs: np.ndarray, method: str, min_cluster_size: int, k_hint: int, use_gpu: bool) -> np.ndarray:
    """Cluster embedding rows; returns one integer label per row.

    Uses HDBSCAN when requested and importable, otherwise FAISS k-means
    (k from k_hint, or a sqrt-of-corpus heuristic when the hint is falsy).
    """
    if method == "HDBSCAN" and HDBSCAN_AVAILABLE:
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=max(5, min_cluster_size // 5),
            metric='euclidean',
        )
        return clusterer.fit_predict(embs)

    # k-means fallback: honor the hint, else scale k with corpus size (>= 10).
    k = max(10, k_hint or int(max(20, math.sqrt(len(embs) / 50))))
    km = faiss.Kmeans(d=embs.shape[1], k=k, niter=25, verbose=False, gpu=use_gpu)
    km.train(embs)
    _, assigned = km.index.search(embs, 1)
    return assigned.reshape(-1)
175
-
176
-
177
def zero_shot_embed_sim(embs: np.ndarray, model: SentenceTransformer, label_texts: List[str], use_gpu: bool) -> Tuple[np.ndarray, np.ndarray]:
    """Zero-shot labeling via embedding similarity.

    Encodes each label as a prompt ("This email is about: <label>") and
    returns, per email embedding, the index of the most similar label and
    that similarity score (dot product of normalized vectors).
    """
    prompts = [f"This email is about: {t}" for t in label_texts]
    device = "cuda" if use_gpu else "cpu"
    label_embs = model.encode(
        prompts,
        normalize_embeddings=True,
        convert_to_numpy=True,
        device=device,
    ).astype(np.float32)
    sims = embs @ label_embs.T
    top_idx = sims.argmax(axis=1)
    top_score = sims[np.arange(len(embs)), top_idx]
    return top_idx, top_score
184
-
185
# ------------------- Defaults -------------------
# Candidate category labels for zero-shot assignment; "Unknown" is last
# (presumably the catch-all bucket — confirm against the labeling caller).
DEFAULT_LABELS = [
    "Newsletters/Subscriptions",
    "Receipts & Billing",
    "Personal/Family",
    "Work/Colleagues",
    "Meetings & Calendars",
    "Travel/Itineraries",
    "Legal/Contracts",
    "System Notifications",
    "Security/2FA",
    "Hiring/Recruiting",
    "Support Tickets",
    "Politics/Government",
    "Media/Press",
    "Unknown",
]
202
-
203
# ------------------- Search -------------------
class EmailSearch:
    """Semantic search over embedded emails using an inner-product FAISS index.

    Embeddings are expected to be L2-normalized (see embed_texts), so the
    inner product behaves as cosine similarity.
    """

    def __init__(self, df, embs, model):
        self.df = df
        self.embs = embs
        self.model = model
        self.index = faiss.IndexFlatIP(embs.shape[1])
        self.index.add(embs)

    def query(self, q: str, top_k=20):
        """Return the top_k most similar rows of df, with a 'score' column."""
        q_emb = self.model.encode([q], normalize_embeddings=True, convert_to_numpy=True)
        scores, idx = self.index.search(q_emb.astype(np.float32), top_k)
        hits = self.df.iloc[idx[0]].copy()
        hits["score"] = scores[0]
        return hits
218
-
219
  # ------------------- Gradio UI -------------------
220
- with gr.Blocks(title="Email Organizer & Browser") as demo:
221
  gr.Markdown("""
222
  # Email Organizer & Browser (No-Redaction)
223
- Upload a **.jsonl** or **.json** of emails. The app normalizes, deduplicates, embeds, clusters, labels, and lets you **search** your inbox semantically.
224
-
225
- **CPU mode defaults**: smaller model, CPU multiprocessing, and skipped language detection for speed. You can change these below.
226
  """)
227
 
228
  with gr.Row():
229
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
230
 
231
  with gr.Row():
232
- model_choice = gr.Dropdown(
233
- label="Embedding model",
234
- choices=[
235
- "sentence-transformers/paraphrase-MiniLM-L3-v2", # fast 384-dim (default)
236
- "sentence-transformers/all-MiniLM-L6-v2", # slower 768-dim
237
- ],
238
- value="sentence-transformers/paraphrase-MiniLM-L3-v2"
239
- )
240
- batch_size_in = gr.Number(label="Batch size (CPU)", value=128, precision=0)
241
- mp_cpu = gr.Checkbox(label="Use CPU multiprocessing", value=True)
242
  skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
243
 
 
 
 
 
 
244
  run_btn = gr.Button("Process", variant="primary")
245
  status = gr.Textbox(label="Status", interactive=False)
246
- label_counts_df = gr.Dataframe(label="Label counts (by sender domain)", interactive=False)
247
- html_samples = gr.HTML(label="Samples")
 
248
 
249
  with gr.Row():
250
  search_query = gr.Textbox(label="Search emails (keywords, names, etc.)")
251
  search_btn = gr.Button("Search")
252
  search_results = gr.Dataframe(label="Search results", interactive=False)
253
 
 
254
  state_df = gr.State()
255
- state_embs = gr.State()
256
- state_model = gr.State()
257
- state_search = gr.State()
258
 
 
259
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
260
  recs: List[Dict[str, Any]] = []
261
  if local_path.endswith(".jsonl"):
@@ -277,75 +173,136 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
277
  recs = [obj]
278
  return recs
279
 
280
- def process_file(inbox_file, model_choice, batch_size_in, mp_cpu, skip_lang):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  if inbox_file is None:
282
- return "Please upload a file", None, None, None, None, None, None
283
 
284
- # apply fast flags
285
  global SKIP_LANGDETECT
286
  SKIP_LANGDETECT = bool(skip_lang)
287
 
288
- local_path = inbox_file.name
289
- recs = _load_json_records(local_path)
290
  if not recs:
291
- return "No valid records found.", None, None, None, None, None, None
292
 
293
- # Normalize
294
  normd = [normalize_email_record(r) for r in recs]
295
  df = pd.DataFrame(normd)
296
-
297
- # Deduplicate
298
  df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)
299
 
300
- # Build texts WITHOUT cap (as requested)
301
  texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()
302
 
303
- # Model (CPU only for free tier)
304
- model = SentenceTransformer(str(model_choice))
305
-
306
- # Embeddings (CPU multiprocessing optional)
307
- embs = embed_texts(
308
- model=model,
309
- texts=texts,
310
- batch_size=int(batch_size_in) if batch_size_in else 128,
311
- use_gpu=False,
312
- use_multiprocess=bool(mp_cpu),
313
  )
 
314
 
315
- # Build simple domain label counts as a quick organizer view
316
- label_counts = df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False)
317
-
318
- # Build searcher
319
- searcher = EmailSearch(df, embs, model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- # Show a small HTML preview of the first 20
322
- sample_html = df.head(20)[["date", "from_email", "subject", "body_text"]].to_html(escape=False)
 
 
323
 
324
- return (
325
- f"Processed {len(df)} emails with model {model_choice} (dim={embs.shape[1]}).",
326
- label_counts,
327
- sample_html,
328
- df,
329
- embs,
330
- model,
331
- searcher
332
- )
333
 
334
  run_btn.click(
335
  process_file,
336
- inputs=[inbox_file, model_choice, batch_size_in, mp_cpu, skip_lang],
337
- outputs=[status, label_counts_df, html_samples, state_df, state_embs, state_model, state_search]
338
  )
339
 
340
- def search_fn(q, df, embs, model, searcher):
341
- if searcher is None or not q:
 
342
  return pd.DataFrame()
343
- results = searcher.query(q, top_k=20)
344
- return results[["date","from_email","subject","body_text","score"]]
 
 
 
 
 
345
 
346
  search_btn.click(
347
  search_fn,
348
- inputs=[search_query, state_df, state_embs, state_model, state_search],
349
  outputs=[search_results]
350
  )
351
 
 
15
  import gradio as gr
16
  from tqdm import tqdm
17
 
18
+ # ---- NEW: classic ML (CPU-fast) ----
19
+ from sklearn.feature_extraction.text import TfidfVectorizer
20
+ from sklearn.cluster import MiniBatchKMeans
21
+ from sklearn.neighbors import NearestNeighbors
 
 
 
 
22
 
23
  # ------------------- Helpers -------------------
24
  URL = re.compile(r"https?://\S+", re.I)
25
+ SKIP_LANGDETECT = True # can be toggled in UI
 
 
 
 
 
 
 
 
 
26
 
27
  def html_to_text(html: str) -> str:
28
  if not html:
 
32
  tag.decompose()
33
  return soup.get_text(separator="\n")
34
 
 
35
  def strip_quotes_and_sigs(text: str) -> str:
36
  if not text:
37
  return ""
 
40
  res = re.split(r"\nSent from my ", res)[0]
41
  return res.strip()
42
 
 
43
  def parse_name_email(s: str) -> Tuple[str, str]:
44
  if not s:
45
  return "", ""
 
49
  return "", s.strip()
50
 
51
  # ------------------- Normalization -------------------
 
52
  def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
53
  subject = raw.get("subject") or raw.get("Subject") or ""
54
  body_html = raw.get("body_html") or raw.get("html") or ""
 
112
  "text_hash": text_hash,
113
  }
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # ------------------- Gradio UI -------------------
116
+ with gr.Blocks(title="Email Organizer & Browser (TF-IDF + MiniBatchKMeans)") as demo:
117
  gr.Markdown("""
118
  # Email Organizer & Browser (No-Redaction)
119
+ **Engine:** TF-IDF (sparse) + MiniBatchKMeans clustering + cosine NearestNeighbors search.
120
+ CPU-fast, no text cap. Upload **.jsonl** or **.json**.
 
121
  """)
122
 
123
  with gr.Row():
124
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
125
 
126
  with gr.Row():
127
+ max_features = gr.Number(label="TF-IDF max_features", value=100_000, precision=0)
128
+ min_df = gr.Number(label="min_df (doc freq ≥)", value=3, precision=0)
129
+ max_df = gr.Slider(label="max_df (fraction ≤)", minimum=0.1, maximum=0.95, value=0.6, step=0.05)
130
+ use_bigrams = gr.Checkbox(label="Use bigrams (1–2)", value=True)
 
 
 
 
 
 
131
  skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
132
 
133
+ with gr.Row():
134
+ auto_k = gr.Checkbox(label="Auto choose k", value=True)
135
+ k_clusters = gr.Number(label="k (MiniBatchKMeans)", value=300, precision=0)
136
+ mb_batch = gr.Number(label="KMeans batch_size", value=4096, precision=0)
137
+
138
  run_btn = gr.Button("Process", variant="primary")
139
  status = gr.Textbox(label="Status", interactive=False)
140
+
141
+ cluster_counts_df = gr.Dataframe(label="Cluster summary", interactive=False)
142
+ html_samples = gr.HTML(label="Preview & Domain counts")
143
 
144
  with gr.Row():
145
  search_query = gr.Textbox(label="Search emails (keywords, names, etc.)")
146
  search_btn = gr.Button("Search")
147
  search_results = gr.Dataframe(label="Search results", interactive=False)
148
 
149
+ # States we need for search:
150
  state_df = gr.State()
151
+ state_vectorizer = gr.State()
152
+ state_nn = gr.State()
 
153
 
154
+ # -------- IO helpers --------
155
  def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
156
  recs: List[Dict[str, Any]] = []
157
  if local_path.endswith(".jsonl"):
 
173
  recs = [obj]
174
  return recs
175
 
176
# -------- Cluster naming --------
def top_terms_per_cluster(X, labels, vectorizer, topn=5):
    """Return dict: cluster_id -> 'term1, term2, ...' using mean TF-IDF weights.

    Args:
        X: sparse TF-IDF matrix, shape (n_docs, n_terms).
        labels: array-like of cluster ids, one per row of X.
        vectorizer: fitted vectorizer exposing get_feature_names_out().
        topn: max number of terms per cluster name.

    Returns:
        Mapping of int cluster id to a comma-joined term string, or
        'cluster_<id>' when a cluster has no positive-weight terms.
    """
    names = vectorizer.get_feature_names_out()
    out = {}
    # np.unique(labels) yields only labels that are present, so every mask
    # selects at least one row — no empty-cluster branch needed.
    for c in np.unique(labels):
        mask = (labels == c)
        # mean tfidf for cluster c, as a dense 1-D array
        mean_vec = X[mask].mean(axis=0).A1
        # BUGFIX: clamp topn — np.argpartition raises when topn exceeds the
        # vocabulary size (small corpora / aggressive min_df).
        k = min(int(topn), mean_vec.size)
        if k <= 0:
            out[int(c)] = f"cluster_{c}"
            continue
        idx = np.argpartition(mean_vec, -k)[-k:]
        idx = idx[np.argsort(-mean_vec[idx])]
        terms = [names[i] for i in idx if mean_vec[i] > 0]
        out[int(c)] = ", ".join(terms) if terms else f"cluster_{c}"
    return out
194
+
195
def auto_k_rule(n_docs: int) -> int:
    """Heuristic cluster count: grows with sqrt(corpus/50), clamped to [100, 600]."""
    scaled = math.sqrt(max(n_docs, 1) / 50.0) * 100
    return int(max(100, min(600, scaled)))
199
+
200
# -------- Main pipeline --------
def process_file(inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang, auto_k, k_clusters, mb_batch):
    """Load, normalize, dedupe, vectorize, cluster, and index the uploaded mailbox.

    Returns a 6-tuple matching the click outputs:
    (status text, cluster-count table, preview HTML, dataframe, vectorizer, NN index).
    The last three are stashed in gr.State for the search handler.
    """
    if inbox_file is None:
        return "Please upload a file", None, None, None, None, None

    # Toggle the module-level language-detection switch before normalization.
    global SKIP_LANGDETECT
    SKIP_LANGDETECT = bool(skip_lang)

    # Load -> normalize
    recs = _load_json_records(inbox_file.name)
    if not recs:
        return "No valid records found.", None, None, None, None, None

    df = pd.DataFrame([normalize_email_record(r) for r in recs])
    # Deduplicate conservatively
    df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)

    # Corpus: subject + body, no length cap.
    texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()

    # Sparse TF-IDF features.
    vec = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 2) if use_bigrams else (1, 1),
        max_features=int(max_features) if max_features else None,
        min_df=int(min_df) if min_df else 1,
        max_df=float(max_df) if max_df else 1.0,
        dtype=np.float32,
    )
    X = vec.fit_transform(texts)

    # Pick k (heuristic or user-supplied), then MiniBatchKMeans.
    k = auto_k_rule(X.shape[0]) if bool(auto_k) else max(10, int(k_clusters or 300))
    kmeans = MiniBatchKMeans(
        n_clusters=k,
        batch_size=int(mb_batch or 4096),
        random_state=0,
        n_init="auto",
    )
    labels = kmeans.fit_predict(X)
    df["cluster_id"] = labels

    # Human-readable cluster names from top mean-TF-IDF terms.
    term_names = top_terms_per_cluster(X, labels, vec, topn=5)
    df["cluster_name"] = [term_names[int(c)] for c in labels]

    # Cosine NN over the TF-IDF rows powers the search tab.
    nn = NearestNeighbors(metric="cosine", algorithm="brute")
    nn.fit(X)

    # Summaries
    cluster_counts = (
        df.groupby(["cluster_id", "cluster_name"]).size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    ).head(500)
    domain_counts = (
        df.groupby("from_domain").size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
        .head(50)
    )

    # HTML preview: first 20 rows plus top sender domains.
    sample_cols = ["date", "from_email", "subject", "cluster_name", "body_text"]
    preview_html = "<h3>Sample (first 20)</h3>" + df.head(20)[sample_cols].to_html(escape=False, index=False)
    preview_html += "<br/><h3>Top sender domains</h3>" + domain_counts.to_html(escape=False, index=False)

    status = f"Processed {len(df)} emails | TF-IDF shape={X.shape} | k={k}"
    # Only df, vec, nn are needed by search; X lives on inside nn.
    return status, cluster_counts, preview_html, df, vec, nn
 
 
 
 
 
 
284
 
285
  run_btn.click(
286
  process_file,
287
+ inputs=[inbox_file, max_features, min_df, max_df, use_bigrams, skip_lang, auto_k, k_clusters, mb_batch],
288
+ outputs=[status, cluster_counts_df, html_samples, state_df, state_vectorizer, state_nn]
289
  )
290
 
291
# -------- Search: cosine NN over TF-IDF --------
def search_fn(q, df, vectorizer, nn):
    """Return up to 20 emails nearest to query q by TF-IDF cosine similarity.

    Yields an empty DataFrame when the query is blank or the pipeline state
    (df / vectorizer / nn) has not been populated yet.
    """
    if (not q) or (df is None) or (vectorizer is None) or (nn is None):
        return pd.DataFrame()
    q_vec = vectorizer.transform([q])
    distances, indices = nn.kneighbors(q_vec, n_neighbors=min(20, len(df)))
    hits = df.iloc[indices[0]].copy()
    hits["score"] = 1.0 - distances[0]  # cosine distance -> similarity
    return hits[["date","from_email","subject","cluster_name","body_text","score"]]
302
 
303
  search_btn.click(
304
  search_fn,
305
+ inputs=[search_query, state_df, state_vectorizer, state_nn],
306
  outputs=[search_results]
307
  )
308