wuhp committed on
Commit
b4b271e
·
verified ·
1 Parent(s): ca09808

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -20
app.py CHANGED
@@ -26,6 +26,7 @@ except Exception:
26
 
27
  # ------------------- Helpers -------------------
28
  URL = re.compile(r"https?://\S+", re.I)
 
29
 
30
 
31
  def torch_cuda_available():
@@ -83,9 +84,12 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
83
  body_text = re.sub(r"\s+", " ", body_text).strip()
84
  subject_norm = re.sub(r"\s+", " ", subject)
85
 
86
- try:
87
- lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
88
- except Exception:
 
 
 
89
  lang = "unknown"
90
 
91
  from_name, from_email = parse_name_email(sender)
@@ -126,13 +130,31 @@ def normalize_email_record(raw: Dict[str, Any]) -> Dict[str, Any]:
126
 
127
  # ------------------- Embeddings & Clustering -------------------
128
 
129
- def embed_texts(model: SentenceTransformer, texts: List[str], batch_size: int, use_gpu: bool) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  embs = []
131
  for i in tqdm(range(0, len(texts), batch_size), desc="Embedding", leave=False):
132
  chunk = texts[i:i + batch_size]
133
  embs.append(model.encode(
134
  chunk,
135
- batch_size=min(256, len(chunk)),
136
  show_progress_bar=False,
137
  normalize_embeddings=True,
138
  convert_to_numpy=True,
@@ -199,14 +221,29 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
199
  gr.Markdown("""
200
  # Email Organizer & Browser (No-Redaction)
201
  Upload a **.jsonl** or **.json** of emails. The app normalizes, deduplicates, embeds, clusters, labels, and lets you **search** your inbox semantically.
 
 
202
  """)
203
 
204
  with gr.Row():
205
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  run_btn = gr.Button("Process", variant="primary")
208
  status = gr.Textbox(label="Status", interactive=False)
209
- label_counts_df = gr.Dataframe(label="Label counts", interactive=False)
210
  html_samples = gr.HTML(label="Samples")
211
 
212
  with gr.Row():
@@ -219,17 +256,17 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
219
  state_model = gr.State()
220
  state_search = gr.State()
221
 
222
- def process_file(inbox_file):
223
- if inbox_file is None:
224
- return "Please upload a file", None, None, None, None, None, None
225
- local_path = inbox_file.name
226
- recs = []
227
  if local_path.endswith(".jsonl"):
228
  with open(local_path, "r", encoding="utf-8") as fh:
229
  for line in fh:
 
 
 
230
  try:
231
  recs.append(json.loads(line))
232
- except:
233
  continue
234
  else:
235
  with open(local_path, "r", encoding="utf-8") as fh:
@@ -238,24 +275,70 @@ with gr.Blocks(title="Email Organizer & Browser") as demo:
238
  recs = obj
239
  elif isinstance(obj, dict):
240
  recs = [obj]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  normd = [normalize_email_record(r) for r in recs]
242
  df = pd.DataFrame(normd)
243
- df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"])
244
- texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("").str.slice(0,2000)).tolist()
245
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
246
- embs = embed_texts(model, texts, 512, torch_cuda_available())
247
- searcher = EmailSearch(df, embs, model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  label_counts = df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False)
249
- return f"Processed {len(df)} emails", label_counts, df.head(20).to_html(), df, embs, model, searcher
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  run_btn.click(
252
  process_file,
253
- inputs=[inbox_file],
254
  outputs=[status, label_counts_df, html_samples, state_df, state_embs, state_model, state_search]
255
  )
256
 
257
  def search_fn(q, df, embs, model, searcher):
258
- if searcher is None:
259
  return pd.DataFrame()
260
  results = searcher.query(q, top_k=20)
261
  return results[["date","from_email","subject","body_text","score"]]
 
26
 
27
  # ------------------- Helpers -------------------
28
  URL = re.compile(r"https?://\S+", re.I)
29
+ SKIP_LANGDETECT = True # CPU-friendly default; can be toggled in the UI
30
 
31
 
32
  def torch_cuda_available():
 
84
  body_text = re.sub(r"\s+", " ", body_text).strip()
85
  subject_norm = re.sub(r"\s+", " ", subject)
86
 
87
+ if not SKIP_LANGDETECT:
88
+ try:
89
+ lang = detect((subject_norm + " " + body_text[:5000]).strip()) if (subject_norm or body_text) else "unknown"
90
+ except Exception:
91
+ lang = "unknown"
92
+ else:
93
  lang = "unknown"
94
 
95
  from_name, from_email = parse_name_email(sender)
 
130
 
131
  # ------------------- Embeddings & Clustering -------------------
132
 
133
+ def embed_texts(
134
+ model: SentenceTransformer,
135
+ texts: List[str],
136
+ batch_size: int,
137
+ use_gpu: bool,
138
+ use_multiprocess: bool = True
139
+ ) -> np.ndarray:
140
+ """
141
+ Faster CPU path: try multi-process first; fall back to single-process batching.
142
+ """
143
+ if not use_gpu and use_multiprocess and (os.cpu_count() or 1) >= 2:
144
+ try:
145
+ pool = model.start_multi_process_pool()
146
+ arr = model.encode_multi_process(texts, pool, normalize_embeddings=True)
147
+ model.stop_multi_process_pool(pool)
148
+ return np.asarray(arr, dtype=np.float32)
149
+ except Exception:
150
+ pass # fallback below
151
+
152
  embs = []
153
  for i in tqdm(range(0, len(texts), batch_size), desc="Embedding", leave=False):
154
  chunk = texts[i:i + batch_size]
155
  embs.append(model.encode(
156
  chunk,
157
+ batch_size=min(batch_size, len(chunk)),
158
  show_progress_bar=False,
159
  normalize_embeddings=True,
160
  convert_to_numpy=True,
 
221
  gr.Markdown("""
222
  # Email Organizer & Browser (No-Redaction)
223
  Upload a **.jsonl** or **.json** of emails. The app normalizes, deduplicates, embeds, clusters, labels, and lets you **search** your inbox semantically.
224
+
225
+ **CPU mode defaults**: smaller model, CPU multiprocessing, and skipped language detection for speed. You can change these below.
226
  """)
227
 
228
  with gr.Row():
229
  inbox_file = gr.File(label="Upload emails (.jsonl or .json)", file_types=[".jsonl", ".json"])
230
 
231
+ with gr.Row():
232
+ model_choice = gr.Dropdown(
233
+ label="Embedding model",
234
+ choices=[
235
+ "sentence-transformers/paraphrase-MiniLM-L3-v2", # fast 384-dim (default)
236
+ "sentence-transformers/all-MiniLM-L6-v2", # more accurate but slower; also 384-dim
237
+ ],
238
+ value="sentence-transformers/paraphrase-MiniLM-L3-v2"
239
+ )
240
+ batch_size_in = gr.Number(label="Batch size (CPU)", value=128, precision=0)
241
+ mp_cpu = gr.Checkbox(label="Use CPU multiprocessing", value=True)
242
+ skip_lang = gr.Checkbox(label="Skip language detection (faster)", value=True)
243
+
244
  run_btn = gr.Button("Process", variant="primary")
245
  status = gr.Textbox(label="Status", interactive=False)
246
+ label_counts_df = gr.Dataframe(label="Label counts (by sender domain)", interactive=False)
247
  html_samples = gr.HTML(label="Samples")
248
 
249
  with gr.Row():
 
256
  state_model = gr.State()
257
  state_search = gr.State()
258
 
259
+ def _load_json_records(local_path: str) -> List[Dict[str, Any]]:
260
+ recs: List[Dict[str, Any]] = []
 
 
 
261
  if local_path.endswith(".jsonl"):
262
  with open(local_path, "r", encoding="utf-8") as fh:
263
  for line in fh:
264
+ line = line.strip()
265
+ if not line:
266
+ continue
267
  try:
268
  recs.append(json.loads(line))
269
+ except Exception:
270
  continue
271
  else:
272
  with open(local_path, "r", encoding="utf-8") as fh:
 
275
  recs = obj
276
  elif isinstance(obj, dict):
277
  recs = [obj]
278
+ return recs
279
+
280
def process_file(inbox_file, model_choice, batch_size_in, mp_cpu, skip_lang):
    """Load, normalize, dedupe, embed, and index an uploaded mailbox.

    Parameters mirror the Gradio inputs: the uploaded file handle, the
    embedding-model name, the CPU batch size, the multiprocessing toggle,
    and the skip-language-detection toggle.

    Returns a 7-tuple matching the Gradio outputs:
    (status message, label-count DataFrame, sample HTML, df, embeddings,
    model, searcher). On error the trailing six slots are None.
    """
    if inbox_file is None:
        return "Please upload a file", None, None, None, None, None, None

    # Apply the UI toggle before normalize_email_record() reads it.
    global SKIP_LANGDETECT
    SKIP_LANGDETECT = bool(skip_lang)

    local_path = inbox_file.name
    recs = _load_json_records(local_path)
    if not recs:
        return "No valid records found.", None, None, None, None, None, None

    # Normalize raw records into a flat schema.
    normd = [normalize_email_record(r) for r in recs]
    df = pd.DataFrame(normd)

    # Deduplicate on stable identity fields.
    df = df.drop_duplicates(subset=["message_id", "subject", "text_hash"]).reset_index(drop=True)

    # Build embedding texts WITHOUT a length cap (full bodies, as requested).
    texts = (df["subject"].fillna("") + "\n\n" + df["body_text"].fillna("")).tolist()

    # CPU-only model for the free tier.
    model = SentenceTransformer(str(model_choice))

    # Embeddings (CPU multiprocessing optional).
    embs = embed_texts(
        model=model,
        texts=texts,
        batch_size=int(batch_size_in) if batch_size_in else 128,
        use_gpu=False,
        use_multiprocess=bool(mp_cpu),
    )

    # Quick organizer view: message counts per sender domain.
    label_counts = df.groupby("from_domain").size().reset_index(name="count").sort_values("count", ascending=False)

    # Semantic search index over the embeddings.
    searcher = EmailSearch(df, embs, model)

    # Preview of the first 20 rows. escape=True (pandas' default) because email
    # subjects/bodies are untrusted input; escape=False would let a crafted
    # email inject arbitrary HTML/JS into the gr.HTML component.
    sample_html = df.head(20)[["date", "from_email", "subject", "body_text"]].to_html(escape=True)

    return (
        f"Processed {len(df)} emails with model {model_choice} (dim={embs.shape[1]}).",
        label_counts,
        sample_html,
        df,
        embs,
        model,
        searcher,
    )
333
 
334
  run_btn.click(
335
  process_file,
336
+ inputs=[inbox_file, model_choice, batch_size_in, mp_cpu, skip_lang],
337
  outputs=[status, label_counts_df, html_samples, state_df, state_embs, state_model, state_search]
338
  )
339
 
340
def search_fn(q, df, embs, model, searcher):
    """Run a semantic query against the built search index.

    Returns an empty DataFrame when there is no query text or no index
    has been built yet; otherwise the top-20 hits restricted to the
    display columns.
    """
    ready = searcher is not None and bool(q)
    if not ready:
        return pd.DataFrame()
    hits = searcher.query(q, top_k=20)
    display_columns = ["date", "from_email", "subject", "body_text", "score"]
    return hits[display_columns]