wuhp committed on
Commit
b4b271e
·
verified ·
1 Parent(s): ca09808

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -20
app.py CHANGED
@@ -26,6 +26,7 @@ except Exception:
26
 
27
  # ------------------- Helpers -------------------
28
  URL = re.compile(r"https?://\S+", re.I)
 
29
 
30
 
31
  def torch_cuda_available():
@@ -83,9 +84,12 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
83
  body_text = re.sub(r"\s+", " ", body_text).strip()
84
  subject_norm = re.sub(r"\s+", " ", subject)
85
 
86
- try:
87
- lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
88
- except Exception:
 
 
 
89
  lang = "unknown"
90
 
91
  from_name, from_email = parse_name_email(sender)
@@ -126,13 +130,31 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
126
 
127
  # ------------------- Embeddings & Clustering -------------------
128
 
129
- def embed_texts(model: SentenceTransformer, texts: List[str], batch_size: int, use_gpu: bool) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  embs = []
131
  for i in tqdm(range(0, len(texts), batch_size), desc="Embedding", leave=False):
132
  chunk = texts[i:i + batch_size]
133
  embs.append(model.encode(
134
  chunk,
135
- batch_size=min(256, len(chunk)),
136
  show_progress_bar=False,
137
  normalize_embeddings=True,
138
  convert_to_numpy=True,
@@ -199,14 +221,29 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
199
  gr.Markdown("""
200
  # Email Organizer & Browser (No-Redaction)
201
  Upload a **.jsonl** or **.json** of emails. The app normalizes, deduplicates, embeds, clusters, labels, and lets you **search** your inbox semantically.
 
 
202
  """)
203
 
204
  with gr.Row():
205
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  run_btn = gr.Button("Process", variant="primary")
208
  status = gr.Textbox(label="Status", interactive=False)
209
- label_counts_df = gr.Dataframe(label="Label counts", interactive=False)
210
  html_samples = gr.HTML(label="Samples")
211
 
212
  with gr.Row():
@@ -219,17 +256,17 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
219
  state_model = gr.State()
220
  state_search = gr.State()
221
 
222
- def process_file(inbox_file):
223
- if inbox_file is None:
224
- return "Please upload a file", None, None, None, None, None, None
225
- local_path = inbox_file.name
226
- recs = []
227
  if local_path.endswith(".jsonl"):
228
  with open(local_path, "r", encoding="utf-8") as fh:
229
  for line in fh:
 
 
 
230
  try:
231
  recs.append(json.loads(line))
232
- except:
233
  continue
234
  else:
235
  with open(local_path, "r", encoding="utf-8") as fh:
@@ -238,24 +275,70 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
238
  recs = obj
239
  elif isinstance(obj, dict):
240
  recs = [obj]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  normd = [normalize_email_record(r) for r in recs]
242
  df = pd.DataFrame(normd)
243
- df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"])
244
- texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("").str.slice(0,2000)).tolist()
245
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
246
- embs = embed_texts(model, texts, 512, torch_cuda_available())
247
- searcher = EmailSearch(df, embs, model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  label_counts = df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False)
249
- return f"Processed {len(df)} emails", label_counts, df.head(20).to_html(), df, embs, model, searcher
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  run_btn.click(
252
  process_file,
253
- inputs=[inbox_file],
254
  outputs=[status, label_counts_df, html_samples, state_df, state_embs, state_model, state_search]
255
  )
256
 
257
  def search_fn(q, df, embs, model, searcher):
258
- if searcher is None:
259
  return pd.DataFrame()
260
  results = searcher.query(q, top_k=20)
261
  return results[["date","from_email","subject","body_text","score"]]
 
26
 
27
  # ------------------- Helpers -------------------
28
  URL = re.compile(r"https?://\S+", re.I)
29
+ SKIP_LANGDETECT = True # CPU-friendly default; can be toggled in the UI
30
 
31
 
32
  def torch_cuda_available():
 
84
  body_text = re.sub(r"\s+", " ", body_text).strip()
85
  subject_norm = re.sub(r"\s+", " ", subject)
86
 
87
+ if not SKIP_LANGDETECT:
88
+ try:
89
+ lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
90
+ except Exception:
91
+ lang = "unknown"
92
+ else:
93
  lang = "unknown"
94
 
95
  from_name, from_email = parse_name_email(sender)
 
130
 
131
  # ------------------- Embeddings & Clustering -------------------
132
 
133
+ def embed_texts(
134
+ model: SentenceTransformer,
135
+ texts: List[str],
136
+ batch_size: int,
137
+ use_gpu: bool,
138
+ use_multiprocess: bool = True
139
+ ) -> np.ndarray:
140
+ """
141
+ Faster CPU path: try multi-process first; fall back to single-process batching.
142
+ """
143
+ if not use_gpu and use_multiprocess and (os.cpu_count() or 1) >= 2:
144
+ try:
145
+ pool = model.start_multi_process_pool()
146
+ arr = model.encode_multi_process(texts, pool, normalize_embeddings=True)
147
+ model.stop_multi_process_pool(pool)
148
+ return np.asarray(arr, dtype=np.float32)
149
+ except Exception:
150
+ pass # fallback below
151
+
152
  embs = []
153
  for i in tqdm(range(0, len(texts), batch_size), desc="Embedding", leave=False):
154
  chunk = texts[i:i + batch_size]
155
  embs.append(model.encode(
156
  chunk,
157
+ batch_size=min(batch_size, len(chunk)),
158
  show_progress_bar=False,
159
  normalize_embeddings=True,
160
  convert_to_numpy=True,
 
221
  gr.Markdown("""
222
  # Email Organizer & Browser (No-Redaction)
223
  Upload a **.jsonl** or **.json** of emails. The app normalizes, deduplicates, embeds, clusters, labels, and lets you **search** your inbox semantically.
224
+
225
+ **CPU mode defaults**: smaller model, CPU multiprocessing, and skipped language detection for speed. You can change these below.
226
  """)
227
 
228
  with gr.Row():
229
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
230
 
231
+ with gr.Row():
232
+ model_choice = gr.Dropdown(
233
+ label="Embedding model",
234
+ choices=[
235
+ "sentence-transformers/paraphrase-MiniLM-L3-v2", # fast 384-dim (default)
236
+ "sentence-transformers/all-MiniLM-L6-v2", # more accurate but slower; also 384-dim
237
+ ],
238
+ value="sentence-transformers/paraphrase-MiniLM-L3-v2"
239
+ )
240
+ batch_size_in = gr.Number(label="Batch size (CPU)", value=128, precision=0)
241
+ mp_cpu = gr.Checkbox(label="Use CPU multiprocessing", value=True)
242
+ skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
243
+
244
  run_btn = gr.Button("Process", variant="primary")
245
  status = gr.Textbox(label="Status", interactive=False)
246
+ label_counts_df = gr.Dataframe(label="Label counts (by sender domain)", interactive=False)
247
  html_samples = gr.HTML(label="Samples")
248
 
249
  with gr.Row():
 
256
  state_model = gr.State()
257
  state_search = gr.State()
258
 
259
+ def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
260
+ recs: List[Dict[str, Any]] = []
 
 
 
261
  if local_path.endswith(".jsonl"):
262
  with open(local_path, "r", encoding="utf-8") as fh:
263
  for line in fh:
264
+ line = line.strip()
265
+ if not line:
266
+ continue
267
  try:
268
  recs.append(json.loads(line))
269
+ except Exception:
270
  continue
271
  else:
272
  with open(local_path, "r", encoding="utf-8") as fh:
 
275
  recs = obj
276
  elif isinstance(obj, dict):
277
  recs = [obj]
278
+ return recs
279
+
280
def process_file(inbox_file, model_choice, batch_size_in, mp_cpu, skip_lang):
    """Load, normalize, dedupe, embed, and index an uploaded mailbox.

    Parameters mirror the Gradio inputs: the uploaded file handle, the
    embedding-model name, the CPU batch size, the multiprocessing toggle,
    and the skip-language-detection toggle.

    Returns a 7-tuple matching the Gradio outputs:
    (status message, label-count DataFrame, sample HTML, df, embeddings,
    model, searcher). On error the trailing six slots are None.
    """
    if inbox_file is None:
        return "Please upload a file", None, None, None, None, None, None

    # Apply the UI toggle before normalize_email_record() reads it.
    global SKIP_LANGDETECT
    SKIP_LANGDETECT = bool(skip_lang)

    local_path = inbox_file.name
    recs = _load_json_records(local_path)
    if not recs:
        return "No valid records found.", None, None, None, None, None, None

    # Normalize raw records into a flat schema.
    normd = [normalize_email_record(r) for r in recs]
    df = pd.DataFrame(normd)

    # Deduplicate on stable identity fields.
    df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)

    # Build embedding texts WITHOUT a length cap (full bodies, as requested).
    texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()

    # CPU-only model for the free tier.
    model = SentenceTransformer(str(model_choice))

    # Embeddings (CPU multiprocessing optional).
    embs = embed_texts(
        model=model,
        texts=texts,
        batch_size=int(batch_size_in) if batch_size_in else 128,
        use_gpu=False,
        use_multiprocess=bool(mp_cpu),
    )

    # Quick organizer view: message counts per sender domain.
    label_counts = df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False)

    # Semantic search index over the embeddings.
    searcher = EmailSearch(df, embs, model)

    # Preview of the first 20 rows. escape=True (pandas' default) because email
    # subjects/bodies are untrusted input; escape=False would let a crafted
    # email inject arbitrary HTML/JS into the gr.HTML component.
    sample_html = df.head(20)[["date", "from_email", "subject", "body_text"]].to_html(escape=True)

    return (
        f"Processed {len(df)} emails with model {model_choice} (dim={embs.shape[1]}).",
        label_counts,
        sample_html,
        df,
        embs,
        model,
        searcher,
    )
333
 
334
  run_btn.click(
335
  process_file,
336
+ inputs=[inbox_file, model_choice, batch_size_in, mp_cpu, skip_lang],
337
  outputs=[status, label_counts_df, html_samples, state_df, state_embs, state_model, state_search]
338
  )
339
 
340
def search_fn(q, df, embs, model, searcher):
    """Run a semantic query against the built search index.

    Returns an empty DataFrame when there is no query text or no index
    has been built yet; otherwise the top-20 hits restricted to the
    display columns.
    """
    ready = searcher is not None and bool(q)
    if not ready:
        return pd.DataFrame()
    hits = searcher.query(q, top_k=20)
    display_columns = ["date", "from_email", "subject", "body_text", "score"]
    return hits[display_columns]