sonuprasad23 commited on
Commit
8a3c8d8
·
1 Parent(s): 9624e78

Project Uploaded

Browse files
Files changed (3) hide show
  1. Dockerfile +15 -18
  2. api_server.py +200 -92
  3. final5.py +89 -55
Dockerfile CHANGED
@@ -1,44 +1,41 @@
1
- # Use a base image with Python
2
  FROM python:3.10-slim
3
 
4
- # Set the working directory
5
  WORKDIR /app
6
 
7
- # Set environment variables to prevent interactive prompts during installation
8
  ENV DEBIAN_FRONTEND=noninteractive
9
  ENV PYTHONUNBUFFERED=1
10
  ENV PYTHONIOENCODING=utf-8
11
  ENV FLASK_APP=api_server.py
12
  ENV FLASK_RUN_HOST=0.0.0.0
13
  ENV FLASK_RUN_PORT=7860
 
 
 
 
14
 
15
- # Install system dependencies needed for apt-add-repository and Chrome
16
  RUN apt-get update && apt-get install -y \
17
  wget \
18
  gnupg \
19
  curl \
20
  unzip \
 
 
21
  && rm -rf /var/lib/apt/lists/*
22
 
23
- # --- START: CORRECTED CHROME INSTALLATION ---
24
- # This is the modern, secure way to add repository keys.
25
- RUN curl -sS -o - https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor | tee /etc/apt/keyrings/google-chrome.gpg >/dev/null \
26
- && echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
27
- && apt-get update \
28
- && apt-get install -y google-chrome-stable \
29
- && rm -rf /var/lib/apt/lists/*
30
- # --- END: CORRECTED CHROME INSTALLATION ---
31
 
32
- # Copy the requirements file and install Python dependencies
33
  COPY requirements.txt .
34
  RUN pip install --no-cache-dir -r requirements.txt
35
 
36
- # Copy the rest of the application files
37
  COPY . .
38
 
39
- # Expose the port the app runs on (Hugging Face uses 7860 by default)
 
40
  EXPOSE 7860
41
 
42
- # Command to run the Flask application
43
- # Use gunicorn for a more robust server in production if needed, but flask dev server is fine for spaces
44
- CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
 
 
1
  FROM python:3.10-slim
2
 
 
3
  WORKDIR /app
4
 
 
5
  ENV DEBIAN_FRONTEND=noninteractive
6
  ENV PYTHONUNBUFFERED=1
7
  ENV PYTHONIOENCODING=utf-8
8
  ENV FLASK_APP=api_server.py
9
  ENV FLASK_RUN_HOST=0.0.0.0
10
  ENV FLASK_RUN_PORT=7860
11
+ ENV HOME=/tmp
12
+ ENV WDM_LOCAL=1
13
+ ENV WDM_CACHE_DIR=/tmp/.wdm
14
+ ENV SE_MANAGER_DRIVER_CACHE=/tmp/selenium
15
 
 
16
  RUN apt-get update && apt-get install -y \
17
  wget \
18
  gnupg \
19
  curl \
20
  unzip \
21
+ ca-certificates \
22
+ fonts-liberation \
23
  && rm -rf /var/lib/apt/lists/*
24
 
25
+ RUN mkdir -p /etc/apt/keyrings && \
26
+ curl -sS -o - https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor | tee /etc/apt/keyrings/google-chrome.gpg >/dev/null && \
27
+ echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
28
+ apt-get update && \
29
+ apt-get install -y google-chrome-stable && \
30
+ rm -rf /var/lib/apt/lists/*
 
 
31
 
 
32
  COPY requirements.txt .
33
  RUN pip install --no-cache-dir -r requirements.txt
34
 
 
35
  COPY . .
36
 
37
+ RUN mkdir -p /tmp/.wdm /tmp/selenium
38
+
39
  EXPOSE 7860
40
 
41
+ CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
 
 
api_server.py CHANGED
@@ -1,4 +1,14 @@
1
- import os, re, json, time, base64, pickle, subprocess, threading, traceback, html, binascii
 
 
 
 
 
 
 
 
 
 
2
  from datetime import datetime
3
  from dataclasses import dataclass, field
4
  from typing import List, Dict, Any, Optional
@@ -11,9 +21,6 @@ from dotenv import load_dotenv
11
 
12
  load_dotenv()
13
 
14
- # --- START: CRITICAL DEFINITIONS ---
15
- # This block is moved to the top to guarantee 'log' exists before it's ever called.
16
-
17
  class LogBuffer:
18
  def __init__(self, max_items: int = 10000):
19
  self._buf: List[Dict[str, Any]] = []
@@ -24,57 +31,57 @@ class LogBuffer:
24
  ts = datetime.now().strftime("%H:%M:%S")
25
  line = {"id": self._next_id, "ts": ts, "level": level, "source": source, "msg": msg}
26
  with self._lock:
27
- self._buf.append(line); self._next_id += 1
28
- if len(self._buf) > self._max: self._buf = self._buf[-self._max:]
 
 
29
  def clear(self):
30
- with self._lock: self._buf.clear()
 
31
  def get_after(self, after_id: int, limit: int = 500):
32
  with self._lock:
33
- if after_id <= 0: data = self._buf[-limit:]
34
- else: data = [x for x in self._buf if x["id"] > after_id][:limit]
 
 
35
  last_id = self._buf[-1]["id"] if self._buf else after_id
36
  return data, last_id
37
 
38
  logs = LogBuffer()
 
39
  def log(msg: str, level: str = "info", source: str = "server"):
40
  logs.append(msg, level, source)
41
  print(f"[{level.upper()}][{source}] {msg}", flush=True)
42
 
43
  def decode_base64_with_padding(b64_string: str) -> bytes:
44
- """Decodes a Base64 string, adding missing padding if necessary."""
45
  missing_padding = len(b64_string) % 4
46
  if missing_padding:
47
- b64_string += '=' * (4 - missing_padding)
48
  try:
49
  return base64.b64decode(b64_string)
50
  except binascii.Error as e:
51
  log(f"Error decoding base64 string: {e}", "error", "SERVER")
52
  return b""
53
- # --- END: CRITICAL DEFINITIONS ---
54
 
55
-
56
- # Define a writable directory for ALL runtime files
57
  WRITABLE_DIR = "/tmp"
58
  COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
59
  SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
60
 
61
- # Decode secrets at startup into the /tmp directory
62
- if 'FB_COOKIES_B64' in os.environ:
63
- decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
64
  if decoded_cookies:
65
- with open(COOKIES_PATH, 'wb') as f:
66
  f.write(decoded_cookies)
67
 
68
- if 'SERVICE_ACCOUNT_B64' in os.environ:
69
- decoded_service_account = decode_base64_with_padding(os.environ['SERVICE_ACCOUNT_B64'])
70
  if decoded_service_account:
71
- with open(SERVICE_ACCOUNT_FILE, 'w') as f:
72
- f.write(decoded_service_account.decode('utf-8'))
73
 
74
- # Define global constants
75
- GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
76
- FINAL5_PATH = os.environ.get("FINAL5_PATH", "final5.py")
77
- PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
78
  SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
79
 
80
  SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
@@ -86,20 +93,18 @@ for i in range(1, 6):
86
  if key:
87
  GEMINI_KEYS.append(key)
88
 
89
- GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
90
  os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
91
  os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
92
 
93
-
94
- # Define the Gmail service builder function
95
  def build_gmail_service():
96
  if not os.path.exists(SERVICE_ACCOUNT_FILE):
97
  log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
98
  return None
99
  try:
100
  creds = service_account.Credentials.from_service_account_file(
101
- SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES).with_subject(SENDER_EMAIL)
102
-
103
  service = build("gmail", "v1", credentials=creds)
104
  log("Gmail service built successfully using service account.", "info", "GMAIL")
105
  return service
@@ -108,11 +113,8 @@ def build_gmail_service():
108
  log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled for the user {SENDER_EMAIL}", "error", "GMAIL")
109
  return None
110
 
111
- # Now that all setup is done, build the service
112
  gmail_service = build_gmail_service()
113
 
114
-
115
- # --- The rest of the application code follows ---
116
  @dataclass
117
  class GroupRun:
118
  link: str
@@ -135,7 +137,7 @@ class PipelineState:
135
  recipients: List[str] = field(default_factory=list)
136
  summary_path: str = ""
137
 
138
- app = Flask(__name__, static_folder='.', static_url_path='')
139
  CORS(app)
140
 
141
  live_lock = threading.Lock()
@@ -172,16 +174,19 @@ def load_scraped_into_live(path: str):
172
  live_state["counts"]["total_posts"] = len(posts)
173
 
174
  def handle_event_line(line: str):
175
- if not line.startswith("::"): return
 
176
  try:
177
  if "::SCRAPE_SAVED::" in line:
178
  path = line.split("::SCRAPE_SAVED::", 1)[1].strip()
179
- if path: load_scraped_into_live(path)
 
180
  elif "::KW_HIT::" in line:
181
  d = json.loads(line.split("::KW_HIT::", 1)[1].strip())
182
  p = ensure_post_obj(int(d["id"]))
183
  p["found_keywords"] = d.get("found_keywords", [])
184
- with live_lock: live_state["counts"]["kw_hits"] += 1
 
185
  elif "::AI_RESULT::" in line:
186
  d = json.loads(line.split("::AI_RESULT::", 1)[1].strip())
187
  p = ensure_post_obj(int(d["id"]))
@@ -189,19 +194,22 @@ def handle_event_line(line: str):
189
  p["ai"] = ai
190
  with live_lock:
191
  live_state["counts"]["ai_done"] += 1
192
- if ai.get("is_medical_seeking"): live_state["counts"]["confirmed"] += 1
 
193
  elif "::EMAIL_SENT::" in line:
194
  d = json.loads(line.split("::EMAIL_SENT::", 1)[1].strip())
195
  p = ensure_post_obj(int(d["id"]))
196
  sent = int(d.get("sent", 0))
197
  p["email_sent"] = sent > 0
198
  if sent > 0:
199
- with live_lock: live_state["counts"]["emails"] += sent
 
200
  except Exception as e:
201
  log(f"live parse error: {e}", "error", "LIVE")
202
 
203
  def read_groups(path: str) -> List[str]:
204
- if not os.path.exists(path): return []
 
205
  with open(path, "r", encoding="utf-8") as f:
206
  return [ln.strip() for ln in f.read().splitlines() if ln.strip()]
207
 
@@ -232,46 +240,103 @@ def send_html_email(to_emails: List[str], subject: str, html_content: str) -> in
232
  log(f"Gmail send error to {to}: {e}", "error", "gmail")
233
  return sent
234
 
235
- def build_confirmed_posts_email(groups_run: List[GroupRun], all_confirmed_posts: List[Dict[str, Any]]) -> str:
236
- total_groups, total_scraped, total_confirmed = len(groups_run), sum(g.scraped_posts for g in groups_run), len(all_confirmed_posts)
237
- table_rows = "".join(f"""
 
 
 
 
238
  <tr>
239
  <td style="padding: 8px; border-bottom: 1px solid #eee;"><a href="{g.link}" target="_blank">{g.link}</a></td>
240
  <td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.scraped_posts}</td>
241
  <td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.detected_posts}</td>
242
  <td style="padding: 8px; border-bottom: 1px solid #eee;">{"OK" if g.stage == "done" else "ERROR"}</td>
243
- </tr>""" for g in groups_run)
244
- summary_table_html = f"""<h3>Group Summary</h3><table style="width: 100%; border-collapse: collapse; margin-top: 8px; border: 1px solid #ddd;"><thead><tr style="background: #0f172a; color: #fff;"><th style="text-align: left; padding: 8px;">Group Link</th><th style="text-align: center; padding: 8px;">Posts Scraped</th><th style="text-align: center; padding: 8px;">Confirmed Posts</th><th style="text-align: left; padding: 8px;">Status</th></tr></thead><tbody>{table_rows}</tbody></table>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  if all_confirmed_posts:
246
- posts_html = "".join(f"""
247
- <div style="margin-bottom: 25px; padding: 12px; border: 1px solid #ddd; border-radius: 5px; background-color: #fafafa;">
248
- <h4 style="margin-top: 0; margin-bottom: 8px;">Post ID: {p.get("id", "N/A")} | Urgency: {p.get("ai_analysis", {}).get("urgency_level", "N/A")} | Confidence: {p.get("ai_analysis", {}).get("confidence", "N/A")}</h4>
249
- <p style="margin: 5px 0;"><strong>Summary:</strong> {html.escape(p.get("ai_analysis", {}).get("medical_summary", "N/A"))}</p>
250
- <p style="margin: 5px 0;"><strong>Text:</strong></p>
251
- <pre style="white-space: pre-wrap; background-color: #f0f0f0; padding: 8px; border: 1px solid #eee; border-radius: 3px; font-family: monospace; font-size: 0.9em;">{html.escape(p.get("text", "N/A"))}</pre>
252
- <p style="margin: 5px 0;"><a href="{p.get("group_link", "#")}" target="_blank">View Group</a></p>
253
- </div>""" for p in all_confirmed_posts)
254
- else: posts_html = "<p>No confirmed medical posts were found during this run.</p>"
255
- return f"""<!DOCTYPE html><html><head><title>Hillside Medical Group - Confirmed Medical Posts Summary</title></head><body style="font-family: Arial, sans-serif; margin: 0; padding: 0; background-color: #f5f5f5;"><div style="max-width: 900px; margin: 20px auto; padding: 20px; background-color: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px;"><div style="background: #1e3c72; color: #fff; padding: 16px 20px; border-radius: 6px 6px 0 0;"><h2 style="margin: 0;">Hillside Medical Group - Confirmed Medical Posts</h2><div style="font-size: 0.9em;">Run completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div></div><div style="padding: 16px;"><p><strong>Overall Summary:</strong> Processed {total_groups} groups, scraped {total_scraped} posts, found {total_confirmed} confirmed medical posts.</p><hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">{summary_table_html}<hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;"><h3>Confirmed Posts Details</h3>{posts_html}</div><div style="margin-top: 20px; padding: 10px; font-size: 0.8em; color: #666; border-top: 1px solid #eee;"><p>This email contains posts identified as potentially seeking personal medical help. Please review and take appropriate action.</p><p><em>Note: The link provided is to the group. Direct post links are not currently extracted.</em></p></div></div></body></html>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  state = PipelineState()
258
 
259
  def stream_process_lines(args: List[str], env: Optional[Dict[str, str]] = None, tag: str = "FINAL5") -> int:
260
  log(f"Exec: {' '.join(args)}", "info", tag)
261
- proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, universal_newlines=True, env=env or os.environ.copy())
 
 
 
 
 
 
 
 
262
  def pump(pipe, name):
263
  for raw in pipe:
264
  line = (raw or "").rstrip("\n")
265
- if not line: continue
 
266
  if line.startswith("::"):
267
- try: handle_event_line(line)
268
- except Exception as e: log(f"event parse error: {e}", "error", tag)
 
 
269
  log(line, "info" if name == "stdout" else "warn", tag)
270
  t1 = threading.Thread(target=pump, args=(proc.stdout, "stdout"), daemon=True)
271
  t2 = threading.Thread(target=pump, args=(proc.stderr, "stderr"), daemon=True)
272
- t1.start(); t2.start()
 
273
  rc = proc.wait()
274
- t1.join(timeout=0.2); t2.join(timeout=0.2)
 
275
  log(f"Exit code: {rc}", "info", tag)
276
  return rc
277
 
@@ -286,10 +351,17 @@ def call_final5_for_group(group_url: str, out_json: str, analysis_json: str, rec
286
  "--cookies-file", COOKIES_PATH,
287
  "--headless"
288
  ]
289
- if GEMINI_KEYS: args.extend(["--gemini-keys", ",".join(GEMINI_KEYS)])
 
290
  env = os.environ.copy()
291
  env["PYTHONUNBUFFERED"] = "1"
292
  env["PYTHONIOENCODING"] = "utf-8"
 
 
 
 
 
 
293
  rc = stream_process_lines(args, env=env, tag="FINAL5")
294
  return {"ok": rc == 0, "code": rc}
295
 
@@ -297,44 +369,57 @@ def run_pipeline(recipients: List[str]):
297
  try:
298
  logs.clear()
299
  log("Pipeline starting", "info", "ORCHESTRATOR")
300
- state.running, state.message, state.progress, state.recipients = True, "initializing", 0, recipients
 
 
 
301
  state.groups.clear()
302
  links = read_groups(GROUPS_TXT)
303
  state.total = len(links)
304
  if not links:
305
  log("No groups found in groups.txt", "warn", "ORCHESTRATOR")
306
- state.message, state.running = "No groups", False
 
307
  return
308
  all_confirmed_posts = []
309
  for i, link in enumerate(links, start=1):
310
  reset_live_state(link)
311
  g = GroupRun(link=link, stage="running")
312
  state.groups.append(g)
313
- state.current, state.message, state.progress = i, f"Processing {link}", int(((i - 1) / max(1, state.total)) * 100)
 
 
314
  log(f"[{i}/{state.total}] Processing group: {link}", "info", "ORCHESTRATOR")
315
  slug = slugify(link)
316
- out_json, analysis_json = os.path.join(SCRAPE_OUTDIR, f"{slug}.json"), os.path.join(ANALYSIS_OUTDIR, f"analysis_{slug}.json")
317
- g.scraped_json, g.analysis_json = out_json, analysis_json
 
 
318
  result = call_final5_for_group(link, out_json, analysis_json, recipients)
319
  if not result.get("ok"):
320
- g.stage, g.error = "error", f"final5 exit code {result.get('code')}"
 
321
  log(f"final5 failed for {link}: code {result.get('code')}", "error", "ORCHESTRATOR")
322
  else:
323
  try:
324
  if os.path.exists(out_json):
325
- with open(out_json, "r", encoding="utf-8") as f: g.scraped_posts = len(json.load(f))
 
326
  if os.path.exists(analysis_json):
327
- with open(analysis_json, "r", encoding="utf-8") as f: a = json.load(f)
 
328
  g.detected_posts = a.get("confirmed_medical", 0)
329
  g.emails_sent_by_final5 = a.get("emails_sent", 0)
330
  confirmed_posts = a.get("posts", [])
331
  for post in confirmed_posts:
332
- if "group_link" not in post: post["group_link"] = link
 
333
  all_confirmed_posts.extend(confirmed_posts)
334
  g.stage = "done"
335
  log(f"Group done: scraped={g.scraped_posts}, confirmed={g.detected_posts}", "info", "ORCHESTRATOR")
336
  except Exception as e:
337
- g.stage, g.error = "error", f"parse_error: {e}"
 
338
  log(f"Parsing outputs failed for {link}: {e}", "error", "ORCHESTRATOR")
339
  state.progress = int((i / max(1, state.total)) * 100)
340
  try:
@@ -346,24 +431,33 @@ def run_pipeline(recipients: List[str]):
346
  log(f"Error building or sending consolidated email: {e}", "error", "ORCHESTRATOR")
347
  summary = {"run_date": datetime.now().isoformat(), "groups": [g.__dict__ for g in state.groups]}
348
  summary_path = os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
349
- with open(summary_path, "w", encoding="utf-8") as f: json.dump(summary, f, ensure_ascii=False, indent=2)
350
- state.summary_path, state.message, state.progress, state.running = summary_path, "All groups processed", 100, False
 
 
 
 
351
  log("Pipeline finished", "info", "ORCHESTRATOR")
352
  except Exception as e:
353
- state.message, state.running = f"pipeline_error: {e}", False
 
354
  log(f"Pipeline error: {e}\n{traceback.format_exc()}", "error", "ORCHESTRATOR")
355
 
356
  @app.route("/")
357
  def index():
358
- return send_from_directory('.', 'index.html')
359
 
360
  @app.get("/api/system/status")
361
  def system_status():
362
  return jsonify({
363
- "gmail": gmail_service is not None, "groups_file_exists": os.path.exists(GROUPS_TXT),
364
- "groups_count": len(read_groups(GROUPS_TXT)), "scrape_outdir": SCRAPE_OUTDIR,
365
- "analysis_outdir": ANALYSIS_OUTDIR, "sender_email": SENDER_EMAIL,
366
- "final5_exists": os.path.exists(FINAL5_PATH), "gemini_keys_count": len(GEMINI_KEYS)
 
 
 
 
367
  })
368
 
369
  @app.get("/api/groups")
@@ -372,18 +466,26 @@ def api_groups():
372
 
373
  @app.post("/api/process/start")
374
  def api_process_start():
375
- if state.running: return jsonify({"success": False, "message": "Already running"}), 409
 
376
  data = request.json or {}
377
  recips = data.get("recipients") or [SENDER_EMAIL]
378
- if isinstance(recips, str): recips = [e.strip() for e in recips.split(",") if e.strip()]
 
379
  threading.Thread(target=run_pipeline, args=(recips,), daemon=True).start()
380
  log(f"Start requested by client; recipients={recips}", "info", "API")
381
  return jsonify({"success": True, "message": "Pipeline started", "recipients": recips})
382
 
383
  @app.get("/api/process/status")
384
  def api_process_status():
385
- return jsonify({"running": state.running, "message": state.message, "progress": state.progress,
386
- "current": state.current, "total": state.total, "groups": [g.__dict__ for g in state.groups]})
 
 
 
 
 
 
387
 
388
  @app.get("/api/process/logs")
389
  def api_process_logs():
@@ -398,25 +500,31 @@ def api_clear_logs():
398
 
399
  @app.get("/api/live/state")
400
  def api_live_state():
401
- with live_lock: return jsonify({"success": True, "data": live_state})
 
402
 
403
  @app.get("/api/results/summary")
404
  def api_results_summary():
405
  p = state.summary_path or os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
406
- if not os.path.exists(p): return jsonify({"success": False, "message": "No summary yet"}), 404
407
- with open(p, "r", encoding="utf-8") as f: return jsonify({"success": True, "data": json.load(f)})
 
 
408
 
409
  @app.get("/api/recipients")
410
  def api_get_recipients():
411
  recipients_path = "recipients.json"
412
- if not os.path.exists(recipients_path): return jsonify({"success": False, "message": "recipients.json not found"}), 404
 
413
  try:
414
- with open(recipients_path, "r", encoding="utf-8") as f: data = json.load(f)
415
- if not isinstance(data, list): return jsonify({"success": False, "message": "Invalid format"}), 500
 
 
416
  return jsonify({"success": True, "data": data})
417
  except Exception as e:
418
  return jsonify({"success": False, "message": f"Error reading file: {str(e)}"}), 500
419
 
420
  if __name__ == "__main__":
421
  port = int(os.environ.get("PORT", 7860))
422
- app.run(host="0.0.0.0", port=port)
 
1
+ import os
2
+ import re
3
+ import json
4
+ import time
5
+ import base64
6
+ import pickle
7
+ import subprocess
8
+ import threading
9
+ import traceback
10
+ import html
11
+ import binascii
12
  from datetime import datetime
13
  from dataclasses import dataclass, field
14
  from typing import List, Dict, Any, Optional
 
21
 
22
  load_dotenv()
23
 
 
 
 
24
  class LogBuffer:
25
  def __init__(self, max_items: int = 10000):
26
  self._buf: List[Dict[str, Any]] = []
 
31
  ts = datetime.now().strftime("%H:%M:%S")
32
  line = {"id": self._next_id, "ts": ts, "level": level, "source": source, "msg": msg}
33
  with self._lock:
34
+ self._buf.append(line)
35
+ self._next_id += 1
36
+ if len(self._buf) > self._max:
37
+ self._buf = self._buf[-self._max:]
38
  def clear(self):
39
+ with self._lock:
40
+ self._buf.clear()
41
  def get_after(self, after_id: int, limit: int = 500):
42
  with self._lock:
43
+ if after_id <= 0:
44
+ data = self._buf[-limit:]
45
+ else:
46
+ data = [x for x in self._buf if x["id"] > after_id][:limit]
47
  last_id = self._buf[-1]["id"] if self._buf else after_id
48
  return data, last_id
49
 
50
  logs = LogBuffer()
51
+
52
  def log(msg: str, level: str = "info", source: str = "server"):
53
  logs.append(msg, level, source)
54
  print(f"[{level.upper()}][{source}] {msg}", flush=True)
55
 
56
  def decode_base64_with_padding(b64_string: str) -> bytes:
 
57
  missing_padding = len(b64_string) % 4
58
  if missing_padding:
59
+ b64_string += "=" * (4 - missing_padding)
60
  try:
61
  return base64.b64decode(b64_string)
62
  except binascii.Error as e:
63
  log(f"Error decoding base64 string: {e}", "error", "SERVER")
64
  return b""
 
65
 
 
 
66
  WRITABLE_DIR = "/tmp"
67
  COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
68
  SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
69
 
70
+ if "FB_COOKIES_B64" in os.environ:
71
+ decoded_cookies = decode_base64_with_padding(os.environ["FB_COOKIES_B64"])
 
72
  if decoded_cookies:
73
+ with open(COOKIES_PATH, "wb") as f:
74
  f.write(decoded_cookies)
75
 
76
+ if "SERVICE_ACCOUNT_B64" in os.environ:
77
+ decoded_service_account = decode_base64_with_padding(os.environ["SERVICE_ACCOUNT_B64"])
78
  if decoded_service_account:
79
+ with open(SERVICE_ACCOUNT_FILE, "w") as f:
80
+ f.write(decoded_service_account.decode("utf-8"))
81
 
82
+ GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
83
+ FINAL5_PATH = os.environ.get("FINAL5_PATH", "final5.py")
84
+ PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
 
85
  SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
86
 
87
  SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
 
93
  if key:
94
  GEMINI_KEYS.append(key)
95
 
96
+ GMAIL_SCOPES = ["https://www.googleapis.com/auth/gmail.send"]
97
  os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
98
  os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
99
 
 
 
100
  def build_gmail_service():
101
  if not os.path.exists(SERVICE_ACCOUNT_FILE):
102
  log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
103
  return None
104
  try:
105
  creds = service_account.Credentials.from_service_account_file(
106
+ SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
107
+ ).with_subject(SENDER_EMAIL)
108
  service = build("gmail", "v1", credentials=creds)
109
  log("Gmail service built successfully using service account.", "info", "GMAIL")
110
  return service
 
113
  log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled for the user {SENDER_EMAIL}", "error", "GMAIL")
114
  return None
115
 
 
116
  gmail_service = build_gmail_service()
117
 
 
 
118
  @dataclass
119
  class GroupRun:
120
  link: str
 
137
  recipients: List[str] = field(default_factory=list)
138
  summary_path: str = ""
139
 
140
+ app = Flask(__name__, static_folder=".", static_url_path="")
141
  CORS(app)
142
 
143
  live_lock = threading.Lock()
 
174
  live_state["counts"]["total_posts"] = len(posts)
175
 
176
  def handle_event_line(line: str):
177
+ if not line.startswith("::"):
178
+ return
179
  try:
180
  if "::SCRAPE_SAVED::" in line:
181
  path = line.split("::SCRAPE_SAVED::", 1)[1].strip()
182
+ if path:
183
+ load_scraped_into_live(path)
184
  elif "::KW_HIT::" in line:
185
  d = json.loads(line.split("::KW_HIT::", 1)[1].strip())
186
  p = ensure_post_obj(int(d["id"]))
187
  p["found_keywords"] = d.get("found_keywords", [])
188
+ with live_lock:
189
+ live_state["counts"]["kw_hits"] += 1
190
  elif "::AI_RESULT::" in line:
191
  d = json.loads(line.split("::AI_RESULT::", 1)[1].strip())
192
  p = ensure_post_obj(int(d["id"]))
 
194
  p["ai"] = ai
195
  with live_lock:
196
  live_state["counts"]["ai_done"] += 1
197
+ if ai.get("is_medical_seeking"):
198
+ live_state["counts"]["confirmed"] += 1
199
  elif "::EMAIL_SENT::" in line:
200
  d = json.loads(line.split("::EMAIL_SENT::", 1)[1].strip())
201
  p = ensure_post_obj(int(d["id"]))
202
  sent = int(d.get("sent", 0))
203
  p["email_sent"] = sent > 0
204
  if sent > 0:
205
+ with live_lock:
206
+ live_state["counts"]["emails"] += sent
207
  except Exception as e:
208
  log(f"live parse error: {e}", "error", "LIVE")
209
 
210
  def read_groups(path: str) -> List[str]:
211
+ if not os.path.exists(path):
212
+ return []
213
  with open(path, "r", encoding="utf-8") as f:
214
  return [ln.strip() for ln in f.read().splitlines() if ln.strip()]
215
 
 
240
  log(f"Gmail send error to {to}: {e}", "error", "gmail")
241
  return sent
242
 
243
+ def build_confirmed_posts_email(groups_run: List["GroupRun"], all_confirmed_posts: List[Dict[str, Any]]) -> str:
244
+ total_groups = len(groups_run)
245
+ total_scraped = sum(g.scraped_posts for g in groups_run)
246
+ total_confirmed = len(all_confirmed_posts)
247
+ rows = []
248
+ for g in groups_run:
249
+ rows.append(f"""
250
  <tr>
251
  <td style="padding: 8px; border-bottom: 1px solid #eee;"><a href="{g.link}" target="_blank">{g.link}</a></td>
252
  <td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.scraped_posts}</td>
253
  <td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.detected_posts}</td>
254
  <td style="padding: 8px; border-bottom: 1px solid #eee;">{"OK" if g.stage == "done" else "ERROR"}</td>
255
+ </tr>""")
256
+ summary_table_html = f"""<h3>Group Summary</h3>
257
+ <table style="width: 100%; border-collapse: collapse; margin-top: 8px; border: 1px solid #ddd;">
258
+ <thead>
259
+ <tr style="background: #0f172a; color: #fff;">
260
+ <th style="text-align: left; padding: 8px;">Group Link</th>
261
+ <th style="text-align: center; padding: 8px;">Posts Scraped</th>
262
+ <th style="text-align: center; padding: 8px;">Confirmed Posts</th>
263
+ <th style="text-align: left; padding: 8px;">Status</th>
264
+ </tr>
265
+ </thead>
266
+ <tbody>
267
+ {''.join(rows)}
268
+ </tbody>
269
+ </table>"""
270
  if all_confirmed_posts:
271
+ posts_html = "".join(
272
+ f"""
273
+ <div style="margin-bottom: 25px; padding: 12px; border: 1px solid #ddd; border-radius: 5px; background-color: #fafafa;">
274
+ <h4 style="margin-top: 0; margin-bottom: 8px;">Post ID: {p.get("id", "N/A")} | Urgency: {p.get("ai_analysis", {}).get("urgency_level", "N/A")} | Confidence: {p.get("ai_analysis", {}).get("confidence", "N/A")}</h4>
275
+ <p style="margin: 5px 0;"><strong>Summary:</strong> {html.escape(p.get("ai_analysis", {}).get("medical_summary", "N/A"))}</p>
276
+ <p style="margin: 5px 0;"><strong>Text:</strong></p>
277
+ <pre style="white-space: pre-wrap; background-color: #f0f0f0; padding: 8px; border: 1px solid #eee; border-radius: 3px; font-family: monospace; font-size: 0.9em;">{html.escape(p.get("text", "N/A"))}</pre>
278
+ <p style="margin: 5px 0;"><a href="{p.get("group_link", "#")}" target="_blank">View Group</a></p>
279
+ </div>"""
280
+ for p in all_confirmed_posts
281
+ )
282
+ else:
283
+ posts_html = "<p>No confirmed medical posts were found during this run.</p>"
284
+ return f"""<!DOCTYPE html>
285
+ <html>
286
+ <head><title>Hillside Medical Group - Confirmed Medical Posts Summary</title></head>
287
+ <body style="font-family: Arial, sans-serif; margin: 0; padding: 0; background-color: #f5f5f5;">
288
+ <div style="max-width: 900px; margin: 20px auto; padding: 20px; background-color: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px;">
289
+ <div style="background: #1e3c72; color: #fff; padding: 16px 20px; border-radius: 6px 6px 0 0;">
290
+ <h2 style="margin: 0;">Hillside Medical Group - Confirmed Medical Posts</h2>
291
+ <div style="font-size: 0.9em;">Run completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
292
+ </div>
293
+ <div style="padding: 16px;">
294
+ <p><strong>Overall Summary:</strong> Processed {total_groups} groups, scraped {total_scraped} posts, found {total_confirmed} confirmed medical posts.</p>
295
+ <hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">
296
+ {summary_table_html}
297
+ <hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">
298
+ <h3>Confirmed Posts Details</h3>
299
+ {posts_html}
300
+ </div>
301
+ <div style="margin-top: 20px; padding: 10px; font-size: 0.8em; color: #666; border-top: 1px solid #eee;">
302
+ <p>This email contains posts identified as potentially seeking personal medical help. Please review and take appropriate action.</p>
303
+ <p><em>Note: The link provided is to the group. Direct post links are not currently extracted.</em></p>
304
+ </div>
305
+ </div>
306
+ </body>
307
+ </html>"""
308
 
309
  state = PipelineState()
310
 
311
  def stream_process_lines(args: List[str], env: Optional[Dict[str, str]] = None, tag: str = "FINAL5") -> int:
312
  log(f"Exec: {' '.join(args)}", "info", tag)
313
+ proc = subprocess.Popen(
314
+ args,
315
+ stdout=subprocess.PIPE,
316
+ stderr=subprocess.PIPE,
317
+ text=True,
318
+ bufsize=1,
319
+ universal_newlines=True,
320
+ env=env or os.environ.copy()
321
+ )
322
  def pump(pipe, name):
323
  for raw in pipe:
324
  line = (raw or "").rstrip("\n")
325
+ if not line:
326
+ continue
327
  if line.startswith("::"):
328
+ try:
329
+ handle_event_line(line)
330
+ except Exception as e:
331
+ log(f"event parse error: {e}", "error", tag)
332
  log(line, "info" if name == "stdout" else "warn", tag)
333
  t1 = threading.Thread(target=pump, args=(proc.stdout, "stdout"), daemon=True)
334
  t2 = threading.Thread(target=pump, args=(proc.stderr, "stderr"), daemon=True)
335
+ t1.start()
336
+ t2.start()
337
  rc = proc.wait()
338
+ t1.join(timeout=0.2)
339
+ t2.join(timeout=0.2)
340
  log(f"Exit code: {rc}", "info", tag)
341
  return rc
342
 
 
351
  "--cookies-file", COOKIES_PATH,
352
  "--headless"
353
  ]
354
+ if GEMINI_KEYS:
355
+ args.extend(["--gemini-keys", ",".join(GEMINI_KEYS)])
356
  env = os.environ.copy()
357
  env["PYTHONUNBUFFERED"] = "1"
358
  env["PYTHONIOENCODING"] = "utf-8"
359
+ env.setdefault("HOME", WRITABLE_DIR)
360
+ env.setdefault("WDM_LOCAL", "1")
361
+ env.setdefault("WDM_CACHE_DIR", os.path.join(WRITABLE_DIR, ".wdm"))
362
+ env.setdefault("SE_MANAGER_DRIVER_CACHE", os.path.join(WRITABLE_DIR, "selenium"))
363
+ os.makedirs(env["WDM_CACHE_DIR"], exist_ok=True)
364
+ os.makedirs(env["SE_MANAGER_DRIVER_CACHE"], exist_ok=True)
365
  rc = stream_process_lines(args, env=env, tag="FINAL5")
366
  return {"ok": rc == 0, "code": rc}
367
 
 
369
  try:
370
  logs.clear()
371
  log("Pipeline starting", "info", "ORCHESTRATOR")
372
+ state.running = True
373
+ state.message = "initializing"
374
+ state.progress = 0
375
+ state.recipients = recipients
376
  state.groups.clear()
377
  links = read_groups(GROUPS_TXT)
378
  state.total = len(links)
379
  if not links:
380
  log("No groups found in groups.txt", "warn", "ORCHESTRATOR")
381
+ state.message = "No groups"
382
+ state.running = False
383
  return
384
  all_confirmed_posts = []
385
  for i, link in enumerate(links, start=1):
386
  reset_live_state(link)
387
  g = GroupRun(link=link, stage="running")
388
  state.groups.append(g)
389
+ state.current = i
390
+ state.message = f"Processing {link}"
391
+ state.progress = int(((i - 1) / max(1, state.total)) * 100)
392
  log(f"[{i}/{state.total}] Processing group: {link}", "info", "ORCHESTRATOR")
393
  slug = slugify(link)
394
+ out_json = os.path.join(SCRAPE_OUTDIR, f"{slug}.json")
395
+ analysis_json = os.path.join(ANALYSIS_OUTDIR, f"analysis_{slug}.json")
396
+ g.scraped_json = out_json
397
+ g.analysis_json = analysis_json
398
  result = call_final5_for_group(link, out_json, analysis_json, recipients)
399
  if not result.get("ok"):
400
+ g.stage = "error"
401
+ g.error = f"final5 exit code {result.get('code')}"
402
  log(f"final5 failed for {link}: code {result.get('code')}", "error", "ORCHESTRATOR")
403
  else:
404
  try:
405
  if os.path.exists(out_json):
406
+ with open(out_json, "r", encoding="utf-8") as f:
407
+ g.scraped_posts = len(json.load(f))
408
  if os.path.exists(analysis_json):
409
+ with open(analysis_json, "r", encoding="utf-8") as f:
410
+ a = json.load(f)
411
  g.detected_posts = a.get("confirmed_medical", 0)
412
  g.emails_sent_by_final5 = a.get("emails_sent", 0)
413
  confirmed_posts = a.get("posts", [])
414
  for post in confirmed_posts:
415
+ if "group_link" not in post:
416
+ post["group_link"] = link
417
  all_confirmed_posts.extend(confirmed_posts)
418
  g.stage = "done"
419
  log(f"Group done: scraped={g.scraped_posts}, confirmed={g.detected_posts}", "info", "ORCHESTRATOR")
420
  except Exception as e:
421
+ g.stage = "error"
422
+ g.error = f"parse_error: {e}"
423
  log(f"Parsing outputs failed for {link}: {e}", "error", "ORCHESTRATOR")
424
  state.progress = int((i / max(1, state.total)) * 100)
425
  try:
 
431
  log(f"Error building or sending consolidated email: {e}", "error", "ORCHESTRATOR")
432
  summary = {"run_date": datetime.now().isoformat(), "groups": [g.__dict__ for g in state.groups]}
433
  summary_path = os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
434
+ with open(summary_path, "w", encoding="utf-8") as f:
435
+ json.dump(summary, f, ensure_ascii=False, indent=2)
436
+ state.summary_path = summary_path
437
+ state.message = "All groups processed"
438
+ state.progress = 100
439
+ state.running = False
440
  log("Pipeline finished", "info", "ORCHESTRATOR")
441
  except Exception as e:
442
+ state.message = f"pipeline_error: {e}"
443
+ state.running = False
444
  log(f"Pipeline error: {e}\n{traceback.format_exc()}", "error", "ORCHESTRATOR")
445
 
446
@app.route("/")
def index():
    # Serve the single-page frontend straight from the application directory.
    return send_from_directory(".", "index.html")
449
 
450
@app.get("/api/system/status")
def system_status():
    # Health/configuration probe: reports which integrations are wired up
    # (Gmail auth, groups file, Gemini keys, final5.py worker script) so the
    # frontend can surface misconfiguration before a run is started.
    return jsonify({
        "gmail": gmail_service is not None,
        "groups_file_exists": os.path.exists(GROUPS_TXT),
        "groups_count": len(read_groups(GROUPS_TXT)),
        "scrape_outdir": SCRAPE_OUTDIR,
        "analysis_outdir": ANALYSIS_OUTDIR,
        "sender_email": SENDER_EMAIL,
        "final5_exists": os.path.exists(FINAL5_PATH),
        "gemini_keys_count": len(GEMINI_KEYS)
    })
 
463
  @app.get("/api/groups")
 
466
 
467
@app.post("/api/process/start")
def api_process_start():
    """Kick off the scraping/analysis pipeline in a background thread.

    The request body may contain "recipients" either as a JSON list of
    addresses or as a single comma-separated string; it defaults to the
    configured sender. Returns 409 if a run is already in progress.
    """
    if state.running:
        return jsonify({"success": False, "message": "Already running"}), 409
    # get_json(silent=True) tolerates a missing body or wrong Content-Type,
    # whereas request.json would make Flask raise a 400/415 before we can
    # answer with our own JSON envelope.
    data = request.get_json(silent=True) or {}
    recips = data.get("recipients") or [SENDER_EMAIL]
    if isinstance(recips, str):
        # Accept the "a@x.com, b@y.com" form from simple clients.
        recips = [e.strip() for e in recips.split(",") if e.strip()]
    threading.Thread(target=run_pipeline, args=(recips,), daemon=True).start()
    log(f"Start requested by client; recipients={recips}", "info", "API")
    return jsonify({"success": True, "message": "Pipeline started", "recipients": recips})
478
 
479
@app.get("/api/process/status")
def api_process_status():
    # Progress snapshot polled by the frontend while a run is active.
    return jsonify({
        "running": state.running,
        "message": state.message,
        "progress": state.progress,
        "current": state.current,
        "total": state.total,
        # GroupRun instances are plain data holders, so __dict__ is a
        # directly JSON-serializable view of each group's result.
        "groups": [g.__dict__ for g in state.groups]
    })
489
 
490
  @app.get("/api/process/logs")
491
  def api_process_logs():
 
500
 
501
@app.get("/api/live/state")
def api_live_state():
    # Hold live_lock while serializing so we return a consistent snapshot
    # even if the worker thread is mutating live_state concurrently.
    with live_lock:
        return jsonify({"success": True, "data": live_state})
505
 
506
@app.get("/api/results/summary")
def api_results_summary():
    """Return the consolidated summary written by the last pipeline run.

    404 if no summary exists yet; 500 with a JSON error (instead of an
    unhandled traceback) if the summary file is unreadable or corrupt.
    """
    p = state.summary_path or os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
    if not os.path.exists(p):
        return jsonify({"success": False, "message": "No summary yet"}), 404
    try:
        with open(p, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # A truncated/partially-written summary should produce a clean API
        # error, not Flask's HTML 500 page.
        return jsonify({"success": False, "message": f"Failed to read summary: {e}"}), 500
    return jsonify({"success": True, "data": data})
513
 
514
@app.get("/api/recipients")
def api_get_recipients():
    """Return the saved recipient list from recipients.json.

    404 if the file is absent, 500 if it cannot be read or does not
    contain a JSON list.
    """
    recipients_path = "recipients.json"
    if not os.path.exists(recipients_path):
        return jsonify({"success": False, "message": "recipients.json not found"}), 404
    # Keep the try body minimal: only the file read/parse can raise here.
    try:
        with open(recipients_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except Exception as exc:
        return jsonify({"success": False, "message": f"Error reading file: {str(exc)}"}), 500
    if not isinstance(payload, list):
        return jsonify({"success": False, "message": "Invalid format"}), 500
    return jsonify({"success": True, "data": payload})
527
 
528
if __name__ == "__main__":
    # Honor a platform-provided PORT; 7860 is the Hugging Face Spaces default.
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)
final5.py CHANGED
@@ -1,5 +1,13 @@
1
- # final5.py
2
- import os, re, sys, time, json, base64, pickle, argparse, traceback, shutil
 
 
 
 
 
 
 
 
3
  from typing import List, Dict, Any, Tuple
4
  from datetime import datetime
5
  import tempfile
@@ -11,16 +19,16 @@ except Exception:
11
  pass
12
 
13
  from selenium import webdriver
14
- from selenium.webdriver.chrome.service import Service as ChromeService
15
  from selenium.webdriver.common.by import By
 
16
  from selenium.webdriver.support.ui import WebDriverWait
17
  from selenium.webdriver.support import expected_conditions as EC
18
- from selenium.common.exceptions import (
19
- StaleElementReferenceException, NoSuchElementException, TimeoutException
20
- )
21
  from google.oauth2 import service_account
22
  from googleapiclient.discovery import build
23
  from googleapiclient.errors import HttpError
 
24
  import google.generativeai as genai
25
  from google.api_core.exceptions import ResourceExhausted
26
 
@@ -35,20 +43,18 @@ def get_args():
35
  p.add_argument("--recipients", default="")
36
  p.add_argument("--sender", default=os.environ.get("SENDER_EMAIL", ""))
37
  p.add_argument("--cookies-file", default=os.path.join(WRITABLE_DIR, "facebook_cookies.pkl"))
38
- p.add_argument("--max-scrolls", type=int, default=int(os.environ.get("MAX_SCROLLS","5")))
39
- p.add_argument("--scroll-pause", type=float, default=float(os.environ.get("SCROLL_PAUSE","3")))
40
  p.add_argument("--gemini-keys", default="")
41
- p.add_argument("--headless", action="store_true", help="Prefer headless browser")
42
  return p.parse_args()
43
 
44
  def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
45
  options = webdriver.ChromeOptions()
46
  user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
47
-
48
- options.binary_location = "/usr/bin/google-chrome"
49
-
50
  options.add_argument(f"--user-data-dir={user_data_dir}")
51
- options.add_argument("--headless=new")
 
52
  options.add_argument("--no-sandbox")
53
  options.add_argument("--disable-dev-shm-usage")
54
  options.add_argument("--disable-gpu")
@@ -57,27 +63,34 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
57
  options.add_argument("--disable-extensions")
58
  options.add_argument("--remote-debugging-port=9222")
59
  options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
60
-
61
- service = ChromeService(executable_path="/usr/bin/chromedriver")
62
-
 
 
 
 
63
  driver = webdriver.Chrome(service=service, options=options)
64
-
65
- print("[SELENIUM] WebDriver session created successfully using system binaries.")
66
  return driver, user_data_dir
67
 
68
  def build_gmail_service():
69
  if os.path.exists(SERVICE_ACCOUNT_FILE):
70
  try:
71
  sender_email = os.environ.get("SENDER_EMAIL")
72
- if not sender_email: return None
 
73
  credentials = service_account.Credentials.from_service_account_file(
74
- SERVICE_ACCOUNT_FILE, scopes=["https://www.googleapis.com/auth/gmail.send"]).with_subject(sender_email)
 
 
75
  return build("gmail", "v1", credentials=credentials)
76
  except Exception as e:
77
  print(f"[GMAIL] Auth failed in final5.py: {e}")
78
  return None
79
 
80
  GEMINI_MODEL = "gemini-1.5-flash"
 
81
  class GeminiManager:
82
  def __init__(self, api_keys: List[str]):
83
  self.api_keys = api_keys
@@ -122,9 +135,19 @@ class GeminiManager:
122
  else:
123
  raise e
124
 
125
- def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
126
- fallback = { "is_medical_seeking": False, "confidence": "low", "medical_summary": "AI unavailable", "suggested_services": [], "urgency_level": "low", "analysis": "Fallback", "reasoning": "AI error", "matched_keywords": found_keywords }
127
- if not gemini_manager or not gemini_manager.is_available(): return fallback
 
 
 
 
 
 
 
 
 
 
128
  keywords_str = ", ".join(found_keywords) if found_keywords else "none"
129
  prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
130
  KEYWORDS: {keywords_str}
@@ -135,19 +158,25 @@ RULES:
135
  Post: "{post_text}"
136
  Return ONLY JSON:
137
  {{
138
- "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
139
- "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
140
- "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1"]
 
 
 
 
 
141
  }}"""
142
  for _ in range(2):
143
  try:
144
  resp = gemini_manager.generate_content(prompt)
145
- txt = (resp.text or "").strip()
146
  s, e = txt.find("{"), txt.rfind("}") + 1
147
  if s >= 0 and e > s:
148
  result = json.loads(txt[s:e])
149
  result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
150
- if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
 
151
  return result
152
  return fallback
153
  except Exception as e:
@@ -155,7 +184,14 @@ Return ONLY JSON:
155
  gemini_manager.rotate_key()
156
  return fallback
157
 
158
- MEDICAL_KEYWORDS = [ "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care","emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health","health center","family doctor","maternity","prenatal","postnatal","labor","delivery","need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help","appointment","checkup","treatment","prescription","medicine","surgery","best hospital","best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception","fertility","hillside","medical group","wellness center" ]
 
 
 
 
 
 
 
159
 
160
  def contains_keywords(text: str) -> Tuple[bool, List[str]]:
161
  tl = (text or "").lower()
@@ -166,22 +202,20 @@ def load_cookies(driver, cookies_file: str):
166
  print("[FB] Navigating to Facebook homepage to load cookies...")
167
  driver.get("https://www.facebook.com")
168
  time.sleep(2)
169
-
170
  if not os.path.exists(cookies_file):
171
  raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
172
-
173
  with open(cookies_file, "rb") as f:
174
  cookies = pickle.load(f)
175
-
176
  for cookie in cookies:
177
- if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
178
  cookie["sameSite"] = "Lax"
179
- driver.add_cookie(cookie)
180
-
 
 
181
  print("[FB] All cookies loaded. Refreshing page to apply session...")
182
  driver.refresh()
183
  time.sleep(5)
184
-
185
  if "log in" in driver.title.lower():
186
  print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
187
  else:
@@ -204,14 +238,15 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
204
  print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
205
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
206
  time.sleep(pause)
207
-
208
  divs = driver.find_elements(By.XPATH, "//div[@role='article']")
209
  added_this_scroll = 0
210
  for d in divs:
211
  try:
212
  txt = (d.text or "").strip()
213
- if len(txt) < 25 or txt in seen: continue
214
- if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]): continue
 
 
215
  seen.add(txt)
216
  posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
217
  added_this_scroll += 1
@@ -227,7 +262,7 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
227
  posts = []
228
  try:
229
  driver, user_data_dir = new_driver(headless=True)
230
- wait = WebDriverWait(driver, 20)
231
  load_cookies(driver, cookies_file)
232
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
233
  except Exception as e:
@@ -235,8 +270,10 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
235
  raise
236
  finally:
237
  if driver:
238
- try: driver.quit()
239
- except Exception: pass
 
 
240
  if user_data_dir and os.path.exists(user_data_dir):
241
  try:
242
  shutil.rmtree(user_data_dir, ignore_errors=True)
@@ -249,42 +286,39 @@ def main():
249
  args = get_args()
250
  os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
251
  os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
252
-
253
  gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
254
  gemini_manager = GeminiManager(gemini_keys)
255
-
256
- posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
257
-
258
  with open(args.out, "w", encoding="utf-8") as f:
259
  json.dump(posts, f, ensure_ascii=False, indent=2)
260
  print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
261
  print(f"::SCRAPE_SAVED::{args.out}")
262
-
263
  keyword_hits, confirmed = [], []
264
  for p in posts:
265
- has, hits = contains_keywords(p.get("text",""))
266
  if has:
267
  p["found_keywords"] = hits
268
  keyword_hits.append(p)
269
  print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
270
-
271
  per_call_sleep = 5
272
  for idx, p in enumerate(keyword_hits, start=1):
273
  found_kws = p.get("found_keywords", [])
274
- ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
275
  p["ai_analysis"] = ai
276
  print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
277
  if ai.get("is_medical_seeking"):
278
  confirmed.append(p)
279
  if idx < len(keyword_hits):
280
  time.sleep(per_call_sleep)
281
-
282
  report = {
283
- "analysis_date": datetime.now().isoformat(), "group_link": args.group,
284
- "total_posts": len(posts), "keyword_hits": len(keyword_hits),
285
- "confirmed_medical": len(confirmed), "emails_sent": 0, "posts": confirmed
 
 
 
 
286
  }
287
-
288
  with open(args.analysis_out, "w", encoding="utf-8") as f:
289
  json.dump(report, f, ensure_ascii=False, indent=2)
290
  print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
 
1
+ import os
2
+ import re
3
+ import sys
4
+ import time
5
+ import json
6
+ import base64
7
+ import pickle
8
+ import argparse
9
+ import traceback
10
+ import shutil
11
  from typing import List, Dict, Any, Tuple
12
  from datetime import datetime
13
  import tempfile
 
19
  pass
20
 
21
  from selenium import webdriver
 
22
  from selenium.webdriver.common.by import By
23
+ from selenium.webdriver.chrome.service import Service as ChromeService
24
  from selenium.webdriver.support.ui import WebDriverWait
25
  from selenium.webdriver.support import expected_conditions as EC
26
+ from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, TimeoutException
27
+
 
28
  from google.oauth2 import service_account
29
  from googleapiclient.discovery import build
30
  from googleapiclient.errors import HttpError
31
+
32
  import google.generativeai as genai
33
  from google.api_core.exceptions import ResourceExhausted
34
 
 
43
  p.add_argument("--recipients", default="")
44
  p.add_argument("--sender", default=os.environ.get("SENDER_EMAIL", ""))
45
  p.add_argument("--cookies-file", default=os.path.join(WRITABLE_DIR, "facebook_cookies.pkl"))
46
+ p.add_argument("--max-scrolls", type=int, default=int(os.environ.get("MAX_SCROLLS", "5")))
47
+ p.add_argument("--scroll-pause", type=float, default=float(os.environ.get("SCROLL_PAUSE", "3")))
48
  p.add_argument("--gemini-keys", default="")
49
+ p.add_argument("--headless", action="store_true")
50
  return p.parse_args()
51
 
52
  def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
53
  options = webdriver.ChromeOptions()
54
  user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
 
 
 
55
  options.add_argument(f"--user-data-dir={user_data_dir}")
56
+ if headless:
57
+ options.add_argument("--headless=new")
58
  options.add_argument("--no-sandbox")
59
  options.add_argument("--disable-dev-shm-usage")
60
  options.add_argument("--disable-gpu")
 
63
  options.add_argument("--disable-extensions")
64
  options.add_argument("--remote-debugging-port=9222")
65
  options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
66
+ os.environ.setdefault("HOME", WRITABLE_DIR)
67
+ os.environ.setdefault("WDM_LOCAL", "1")
68
+ os.environ.setdefault("WDM_CACHE_DIR", os.path.join(WRITABLE_DIR, ".wdm"))
69
+ os.environ.setdefault("SE_MANAGER_DRIVER_CACHE", os.path.join(WRITABLE_DIR, "selenium"))
70
+ os.makedirs(os.environ["WDM_CACHE_DIR"], exist_ok=True)
71
+ os.makedirs(os.environ["SE_MANAGER_DRIVER_CACHE"], exist_ok=True)
72
+ service = ChromeService()
73
  driver = webdriver.Chrome(service=service, options=options)
74
+ print("[SELENIUM] WebDriver session created successfully.")
 
75
  return driver, user_data_dir
76
 
77
def build_gmail_service():
    """Build an authenticated Gmail API client, or return None.

    Requires the service-account key file to exist and SENDER_EMAIL to be
    set (the credentials are delegated to that address). Any auth failure
    is printed and swallowed so the caller can proceed without email.
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None
    sender = os.environ.get("SENDER_EMAIL")
    if not sender:
        return None
    try:
        creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE,
            scopes=["https://www.googleapis.com/auth/gmail.send"],
        ).with_subject(sender)
        return build("gmail", "v1", credentials=creds)
    except Exception as exc:
        print(f"[GMAIL] Auth failed in final5.py: {exc}")
        return None
91
 
92
# Gemini model used for post-intent classification; API-key rotation on
# quota exhaustion is handled by GeminiManager.
GEMINI_MODEL = "gemini-1.5-flash"
93
+
94
  class GeminiManager:
95
  def __init__(self, api_keys: List[str]):
96
  self.api_keys = api_keys
 
135
  else:
136
  raise e
137
 
138
+ def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str, Any]:
139
+ fallback = {
140
+ "is_medical_seeking": False,
141
+ "confidence": "low",
142
+ "medical_summary": "AI unavailable",
143
+ "suggested_services": [],
144
+ "urgency_level": "low",
145
+ "analysis": "Fallback",
146
+ "reasoning": "AI error",
147
+ "matched_keywords": found_keywords
148
+ }
149
+ if not gemini_manager or not gemini_manager.is_available():
150
+ return fallback
151
  keywords_str = ", ".join(found_keywords) if found_keywords else "none"
152
  prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
153
  KEYWORDS: {keywords_str}
 
158
  Post: "{post_text}"
159
  Return ONLY JSON:
160
  {{
161
+ "is_medical_seeking": true/false,
162
+ "confidence": "high/medium/low",
163
+ "medical_summary": "short summary",
164
+ "suggested_services": ["service1","service2"],
165
+ "urgency_level": "high/medium/low",
166
+ "analysis": "why it's seeking help",
167
+ "reasoning": "short explanation",
168
+ "matched_keywords": ["keyword1"]
169
  }}"""
170
  for _ in range(2):
171
  try:
172
  resp = gemini_manager.generate_content(prompt)
173
+ txt = (getattr(resp, "text", "") or "").strip()
174
  s, e = txt.find("{"), txt.rfind("}") + 1
175
  if s >= 0 and e > s:
176
  result = json.loads(txt[s:e])
177
  result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
178
+ if "matched_keywords" not in result:
179
+ result["matched_keywords"] = found_keywords
180
  return result
181
  return fallback
182
  except Exception as e:
 
184
  gemini_manager.rotate_key()
185
  return fallback
186
 
187
# First-pass filter vocabulary: a post must contain at least one of these
# substrings (matched case-insensitively by contains_keywords) before it is
# sent to the Gemini model for full intent analysis.
MEDICAL_KEYWORDS = [
    "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care","emergency","er",
    "specialist","pediatrician","dentist","gynecologist","obgyn","women's health","health center","family doctor",
    "maternity","prenatal","postnatal","labor","delivery","need doctor","looking for doctor","find doctor",
    "recommend doctor","medical help","health help","appointment","checkup","treatment","prescription","medicine",
    "surgery","best hospital","best clinic","where to go","doctor recommendation","pregnancy","birth control",
    "contraception","fertility","hillside","medical group","wellness center"
]
195
 
196
  def contains_keywords(text: str) -> Tuple[bool, List[str]]:
197
  tl = (text or "").lower()
 
202
  print("[FB] Navigating to Facebook homepage to load cookies...")
203
  driver.get("https://www.facebook.com")
204
  time.sleep(2)
 
205
  if not os.path.exists(cookies_file):
206
  raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
 
207
  with open(cookies_file, "rb") as f:
208
  cookies = pickle.load(f)
 
209
  for cookie in cookies:
210
+ if "sameSite" in cookie and cookie["sameSite"] not in ["Strict", "Lax", "None"]:
211
  cookie["sameSite"] = "Lax"
212
+ try:
213
+ driver.add_cookie(cookie)
214
+ except Exception:
215
+ pass
216
  print("[FB] All cookies loaded. Refreshing page to apply session...")
217
  driver.refresh()
218
  time.sleep(5)
 
219
  if "log in" in driver.title.lower():
220
  print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
221
  else:
 
238
  print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
239
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
240
  time.sleep(pause)
 
241
  divs = driver.find_elements(By.XPATH, "//div[@role='article']")
242
  added_this_scroll = 0
243
  for d in divs:
244
  try:
245
  txt = (d.text or "").strip()
246
+ if len(txt) < 25 or txt in seen:
247
+ continue
248
+ if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]):
249
+ continue
250
  seen.add(txt)
251
  posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
252
  added_this_scroll += 1
 
262
  posts = []
263
  try:
264
  driver, user_data_dir = new_driver(headless=True)
265
+ wait = WebDriverWait(driver, 30)
266
  load_cookies(driver, cookies_file)
267
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
268
  except Exception as e:
 
270
  raise
271
  finally:
272
  if driver:
273
+ try:
274
+ driver.quit()
275
+ except Exception:
276
+ pass
277
  if user_data_dir and os.path.exists(user_data_dir):
278
  try:
279
  shutil.rmtree(user_data_dir, ignore_errors=True)
 
286
  args = get_args()
287
  os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
288
  os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
 
289
  gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
290
  gemini_manager = GeminiManager(gemini_keys)
291
+ posts = try_scrape_with_fallback(args.group.strip(), args.cookies_file, args.max_scrolls, args.scroll_pause)
 
 
292
  with open(args.out, "w", encoding="utf-8") as f:
293
  json.dump(posts, f, ensure_ascii=False, indent=2)
294
  print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
295
  print(f"::SCRAPE_SAVED::{args.out}")
 
296
  keyword_hits, confirmed = [], []
297
  for p in posts:
298
+ has, hits = contains_keywords(p.get("text", ""))
299
  if has:
300
  p["found_keywords"] = hits
301
  keyword_hits.append(p)
302
  print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
 
303
  per_call_sleep = 5
304
  for idx, p in enumerate(keyword_hits, start=1):
305
  found_kws = p.get("found_keywords", [])
306
+ ai = ai_medical_intent(gemini_manager, p.get("text", ""), found_kws)
307
  p["ai_analysis"] = ai
308
  print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
309
  if ai.get("is_medical_seeking"):
310
  confirmed.append(p)
311
  if idx < len(keyword_hits):
312
  time.sleep(per_call_sleep)
 
313
  report = {
314
+ "analysis_date": datetime.now().isoformat(),
315
+ "group_link": args.group,
316
+ "total_posts": len(posts),
317
+ "keyword_hits": len(keyword_hits),
318
+ "confirmed_medical": len(confirmed),
319
+ "emails_sent": 0,
320
+ "posts": confirmed
321
  }
 
322
  with open(args.analysis_out, "w", encoding="utf-8") as f:
323
  json.dump(report, f, ensure_ascii=False, indent=2)
324
  print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")