Spaces:
Sleeping
Sleeping
Commit
·
8a3c8d8
1
Parent(s):
9624e78
Project Uploaded
Browse files- Dockerfile +15 -18
- api_server.py +200 -92
- final5.py +89 -55
Dockerfile
CHANGED
|
@@ -1,44 +1,41 @@
|
|
| 1 |
-
# Use a base image with Python
|
| 2 |
FROM python:3.10-slim
|
| 3 |
|
| 4 |
-
# Set the working directory
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
# Set environment variables to prevent interactive prompts during installation
|
| 8 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 9 |
ENV PYTHONUNBUFFERED=1
|
| 10 |
ENV PYTHONIOENCODING=utf-8
|
| 11 |
ENV FLASK_APP=api_server.py
|
| 12 |
ENV FLASK_RUN_HOST=0.0.0.0
|
| 13 |
ENV FLASK_RUN_PORT=7860
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
# Install system dependencies needed for apt-add-repository and Chrome
|
| 16 |
RUN apt-get update && apt-get install -y \
|
| 17 |
wget \
|
| 18 |
gnupg \
|
| 19 |
curl \
|
| 20 |
unzip \
|
|
|
|
|
|
|
| 21 |
&& rm -rf /var/lib/apt/lists/*
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
-
# --- END: CORRECTED CHROME INSTALLATION ---
|
| 31 |
|
| 32 |
-
# Copy the requirements file and install Python dependencies
|
| 33 |
COPY requirements.txt .
|
| 34 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 35 |
|
| 36 |
-
# Copy the rest of the application files
|
| 37 |
COPY . .
|
| 38 |
|
| 39 |
-
|
|
|
|
| 40 |
EXPOSE 7860
|
| 41 |
|
| 42 |
-
|
| 43 |
-
# Use gunicorn for a more robust server in production if needed, but flask dev server is fine for spaces
|
| 44 |
-
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
|
|
|
|
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 6 |
ENV PYTHONUNBUFFERED=1
|
| 7 |
ENV PYTHONIOENCODING=utf-8
|
| 8 |
ENV FLASK_APP=api_server.py
|
| 9 |
ENV FLASK_RUN_HOST=0.0.0.0
|
| 10 |
ENV FLASK_RUN_PORT=7860
|
| 11 |
+
ENV HOME=/tmp
|
| 12 |
+
ENV WDM_LOCAL=1
|
| 13 |
+
ENV WDM_CACHE_DIR=/tmp/.wdm
|
| 14 |
+
ENV SE_MANAGER_DRIVER_CACHE=/tmp/selenium
|
| 15 |
|
|
|
|
| 16 |
RUN apt-get update && apt-get install -y \
|
| 17 |
wget \
|
| 18 |
gnupg \
|
| 19 |
curl \
|
| 20 |
unzip \
|
| 21 |
+
ca-certificates \
|
| 22 |
+
fonts-liberation \
|
| 23 |
&& rm -rf /var/lib/apt/lists/*
|
| 24 |
|
| 25 |
+
RUN mkdir -p /etc/apt/keyrings && \
|
| 26 |
+
curl -sS -o - https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor | tee /etc/apt/keyrings/google-chrome.gpg >/dev/null && \
|
| 27 |
+
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
|
| 28 |
+
apt-get update && \
|
| 29 |
+
apt-get install -y google-chrome-stable && \
|
| 30 |
+
rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
| 31 |
|
|
|
|
| 32 |
COPY requirements.txt .
|
| 33 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 34 |
|
|
|
|
| 35 |
COPY . .
|
| 36 |
|
| 37 |
+
RUN mkdir -p /tmp/.wdm /tmp/selenium
|
| 38 |
+
|
| 39 |
EXPOSE 7860
|
| 40 |
|
| 41 |
+
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
|
|
|
|
|
api_server.py
CHANGED
|
@@ -1,4 +1,14 @@
|
|
| 1 |
-
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
from typing import List, Dict, Any, Optional
|
|
@@ -11,9 +21,6 @@ from dotenv import load_dotenv
|
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
| 14 |
-
# --- START: CRITICAL DEFINITIONS ---
|
| 15 |
-
# This block is moved to the top to guarantee 'log' exists before it's ever called.
|
| 16 |
-
|
| 17 |
class LogBuffer:
|
| 18 |
def __init__(self, max_items: int = 10000):
|
| 19 |
self._buf: List[Dict[str, Any]] = []
|
|
@@ -24,57 +31,57 @@ class LogBuffer:
|
|
| 24 |
ts = datetime.now().strftime("%H:%M:%S")
|
| 25 |
line = {"id": self._next_id, "ts": ts, "level": level, "source": source, "msg": msg}
|
| 26 |
with self._lock:
|
| 27 |
-
self._buf.append(line)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
def clear(self):
|
| 30 |
-
with self._lock:
|
|
|
|
| 31 |
def get_after(self, after_id: int, limit: int = 500):
|
| 32 |
with self._lock:
|
| 33 |
-
if after_id <= 0:
|
| 34 |
-
|
|
|
|
|
|
|
| 35 |
last_id = self._buf[-1]["id"] if self._buf else after_id
|
| 36 |
return data, last_id
|
| 37 |
|
| 38 |
logs = LogBuffer()
|
|
|
|
| 39 |
def log(msg: str, level: str = "info", source: str = "server"):
|
| 40 |
logs.append(msg, level, source)
|
| 41 |
print(f"[{level.upper()}][{source}] {msg}", flush=True)
|
| 42 |
|
| 43 |
def decode_base64_with_padding(b64_string: str) -> bytes:
|
| 44 |
-
"""Decodes a Base64 string, adding missing padding if necessary."""
|
| 45 |
missing_padding = len(b64_string) % 4
|
| 46 |
if missing_padding:
|
| 47 |
-
b64_string +=
|
| 48 |
try:
|
| 49 |
return base64.b64decode(b64_string)
|
| 50 |
except binascii.Error as e:
|
| 51 |
log(f"Error decoding base64 string: {e}", "error", "SERVER")
|
| 52 |
return b""
|
| 53 |
-
# --- END: CRITICAL DEFINITIONS ---
|
| 54 |
|
| 55 |
-
|
| 56 |
-
# Define a writable directory for ALL runtime files
|
| 57 |
WRITABLE_DIR = "/tmp"
|
| 58 |
COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
|
| 59 |
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
|
| 64 |
if decoded_cookies:
|
| 65 |
-
with open(COOKIES_PATH,
|
| 66 |
f.write(decoded_cookies)
|
| 67 |
|
| 68 |
-
if
|
| 69 |
-
decoded_service_account = decode_base64_with_padding(os.environ[
|
| 70 |
if decoded_service_account:
|
| 71 |
-
with open(SERVICE_ACCOUNT_FILE,
|
| 72 |
-
f.write(decoded_service_account.decode(
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
|
| 78 |
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
|
| 79 |
|
| 80 |
SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
|
|
@@ -86,20 +93,18 @@ for i in range(1, 6):
|
|
| 86 |
if key:
|
| 87 |
GEMINI_KEYS.append(key)
|
| 88 |
|
| 89 |
-
GMAIL_SCOPES = [
|
| 90 |
os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
|
| 91 |
os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
# Define the Gmail service builder function
|
| 95 |
def build_gmail_service():
|
| 96 |
if not os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 97 |
log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
|
| 98 |
return None
|
| 99 |
try:
|
| 100 |
creds = service_account.Credentials.from_service_account_file(
|
| 101 |
-
SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
|
| 102 |
-
|
| 103 |
service = build("gmail", "v1", credentials=creds)
|
| 104 |
log("Gmail service built successfully using service account.", "info", "GMAIL")
|
| 105 |
return service
|
|
@@ -108,11 +113,8 @@ def build_gmail_service():
|
|
| 108 |
log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled for the user {SENDER_EMAIL}", "error", "GMAIL")
|
| 109 |
return None
|
| 110 |
|
| 111 |
-
# Now that all setup is done, build the service
|
| 112 |
gmail_service = build_gmail_service()
|
| 113 |
|
| 114 |
-
|
| 115 |
-
# --- The rest of the application code follows ---
|
| 116 |
@dataclass
|
| 117 |
class GroupRun:
|
| 118 |
link: str
|
|
@@ -135,7 +137,7 @@ class PipelineState:
|
|
| 135 |
recipients: List[str] = field(default_factory=list)
|
| 136 |
summary_path: str = ""
|
| 137 |
|
| 138 |
-
app = Flask(__name__, static_folder=
|
| 139 |
CORS(app)
|
| 140 |
|
| 141 |
live_lock = threading.Lock()
|
|
@@ -172,16 +174,19 @@ def load_scraped_into_live(path: str):
|
|
| 172 |
live_state["counts"]["total_posts"] = len(posts)
|
| 173 |
|
| 174 |
def handle_event_line(line: str):
|
| 175 |
-
if not line.startswith("::"):
|
|
|
|
| 176 |
try:
|
| 177 |
if "::SCRAPE_SAVED::" in line:
|
| 178 |
path = line.split("::SCRAPE_SAVED::", 1)[1].strip()
|
| 179 |
-
if path:
|
|
|
|
| 180 |
elif "::KW_HIT::" in line:
|
| 181 |
d = json.loads(line.split("::KW_HIT::", 1)[1].strip())
|
| 182 |
p = ensure_post_obj(int(d["id"]))
|
| 183 |
p["found_keywords"] = d.get("found_keywords", [])
|
| 184 |
-
with live_lock:
|
|
|
|
| 185 |
elif "::AI_RESULT::" in line:
|
| 186 |
d = json.loads(line.split("::AI_RESULT::", 1)[1].strip())
|
| 187 |
p = ensure_post_obj(int(d["id"]))
|
|
@@ -189,19 +194,22 @@ def handle_event_line(line: str):
|
|
| 189 |
p["ai"] = ai
|
| 190 |
with live_lock:
|
| 191 |
live_state["counts"]["ai_done"] += 1
|
| 192 |
-
if ai.get("is_medical_seeking"):
|
|
|
|
| 193 |
elif "::EMAIL_SENT::" in line:
|
| 194 |
d = json.loads(line.split("::EMAIL_SENT::", 1)[1].strip())
|
| 195 |
p = ensure_post_obj(int(d["id"]))
|
| 196 |
sent = int(d.get("sent", 0))
|
| 197 |
p["email_sent"] = sent > 0
|
| 198 |
if sent > 0:
|
| 199 |
-
with live_lock:
|
|
|
|
| 200 |
except Exception as e:
|
| 201 |
log(f"live parse error: {e}", "error", "LIVE")
|
| 202 |
|
| 203 |
def read_groups(path: str) -> List[str]:
|
| 204 |
-
if not os.path.exists(path):
|
|
|
|
| 205 |
with open(path, "r", encoding="utf-8") as f:
|
| 206 |
return [ln.strip() for ln in f.read().splitlines() if ln.strip()]
|
| 207 |
|
|
@@ -232,46 +240,103 @@ def send_html_email(to_emails: List[str], subject: str, html_content: str) -> in
|
|
| 232 |
log(f"Gmail send error to {to}: {e}", "error", "gmail")
|
| 233 |
return sent
|
| 234 |
|
| 235 |
-
def build_confirmed_posts_email(groups_run: List[GroupRun], all_confirmed_posts: List[Dict[str, Any]]) -> str:
|
| 236 |
-
total_groups
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
<tr>
|
| 239 |
<td style="padding: 8px; border-bottom: 1px solid #eee;"><a href="{g.link}" target="_blank">{g.link}</a></td>
|
| 240 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.scraped_posts}</td>
|
| 241 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.detected_posts}</td>
|
| 242 |
<td style="padding: 8px; border-bottom: 1px solid #eee;">{"OK" if g.stage == "done" else "ERROR"}</td>
|
| 243 |
-
</tr>"""
|
| 244 |
-
summary_table_html = f"""<h3>Group Summary</h3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
if all_confirmed_posts:
|
| 246 |
-
posts_html = "".join(
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
state = PipelineState()
|
| 258 |
|
| 259 |
def stream_process_lines(args: List[str], env: Optional[Dict[str, str]] = None, tag: str = "FINAL5") -> int:
|
| 260 |
log(f"Exec: {' '.join(args)}", "info", tag)
|
| 261 |
-
proc = subprocess.Popen(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
def pump(pipe, name):
|
| 263 |
for raw in pipe:
|
| 264 |
line = (raw or "").rstrip("\n")
|
| 265 |
-
if not line:
|
|
|
|
| 266 |
if line.startswith("::"):
|
| 267 |
-
try:
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
log(line, "info" if name == "stdout" else "warn", tag)
|
| 270 |
t1 = threading.Thread(target=pump, args=(proc.stdout, "stdout"), daemon=True)
|
| 271 |
t2 = threading.Thread(target=pump, args=(proc.stderr, "stderr"), daemon=True)
|
| 272 |
-
t1.start()
|
|
|
|
| 273 |
rc = proc.wait()
|
| 274 |
-
t1.join(timeout=0.2)
|
|
|
|
| 275 |
log(f"Exit code: {rc}", "info", tag)
|
| 276 |
return rc
|
| 277 |
|
|
@@ -286,10 +351,17 @@ def call_final5_for_group(group_url: str, out_json: str, analysis_json: str, rec
|
|
| 286 |
"--cookies-file", COOKIES_PATH,
|
| 287 |
"--headless"
|
| 288 |
]
|
| 289 |
-
if GEMINI_KEYS:
|
|
|
|
| 290 |
env = os.environ.copy()
|
| 291 |
env["PYTHONUNBUFFERED"] = "1"
|
| 292 |
env["PYTHONIOENCODING"] = "utf-8"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
rc = stream_process_lines(args, env=env, tag="FINAL5")
|
| 294 |
return {"ok": rc == 0, "code": rc}
|
| 295 |
|
|
@@ -297,44 +369,57 @@ def run_pipeline(recipients: List[str]):
|
|
| 297 |
try:
|
| 298 |
logs.clear()
|
| 299 |
log("Pipeline starting", "info", "ORCHESTRATOR")
|
| 300 |
-
state.running
|
|
|
|
|
|
|
|
|
|
| 301 |
state.groups.clear()
|
| 302 |
links = read_groups(GROUPS_TXT)
|
| 303 |
state.total = len(links)
|
| 304 |
if not links:
|
| 305 |
log("No groups found in groups.txt", "warn", "ORCHESTRATOR")
|
| 306 |
-
state.message
|
|
|
|
| 307 |
return
|
| 308 |
all_confirmed_posts = []
|
| 309 |
for i, link in enumerate(links, start=1):
|
| 310 |
reset_live_state(link)
|
| 311 |
g = GroupRun(link=link, stage="running")
|
| 312 |
state.groups.append(g)
|
| 313 |
-
state.current
|
|
|
|
|
|
|
| 314 |
log(f"[{i}/{state.total}] Processing group: {link}", "info", "ORCHESTRATOR")
|
| 315 |
slug = slugify(link)
|
| 316 |
-
out_json
|
| 317 |
-
|
|
|
|
|
|
|
| 318 |
result = call_final5_for_group(link, out_json, analysis_json, recipients)
|
| 319 |
if not result.get("ok"):
|
| 320 |
-
g.stage
|
|
|
|
| 321 |
log(f"final5 failed for {link}: code {result.get('code')}", "error", "ORCHESTRATOR")
|
| 322 |
else:
|
| 323 |
try:
|
| 324 |
if os.path.exists(out_json):
|
| 325 |
-
with open(out_json, "r", encoding="utf-8") as f:
|
|
|
|
| 326 |
if os.path.exists(analysis_json):
|
| 327 |
-
with open(analysis_json, "r", encoding="utf-8") as f:
|
|
|
|
| 328 |
g.detected_posts = a.get("confirmed_medical", 0)
|
| 329 |
g.emails_sent_by_final5 = a.get("emails_sent", 0)
|
| 330 |
confirmed_posts = a.get("posts", [])
|
| 331 |
for post in confirmed_posts:
|
| 332 |
-
if "group_link" not in post:
|
|
|
|
| 333 |
all_confirmed_posts.extend(confirmed_posts)
|
| 334 |
g.stage = "done"
|
| 335 |
log(f"Group done: scraped={g.scraped_posts}, confirmed={g.detected_posts}", "info", "ORCHESTRATOR")
|
| 336 |
except Exception as e:
|
| 337 |
-
g.stage
|
|
|
|
| 338 |
log(f"Parsing outputs failed for {link}: {e}", "error", "ORCHESTRATOR")
|
| 339 |
state.progress = int((i / max(1, state.total)) * 100)
|
| 340 |
try:
|
|
@@ -346,24 +431,33 @@ def run_pipeline(recipients: List[str]):
|
|
| 346 |
log(f"Error building or sending consolidated email: {e}", "error", "ORCHESTRATOR")
|
| 347 |
summary = {"run_date": datetime.now().isoformat(), "groups": [g.__dict__ for g in state.groups]}
|
| 348 |
summary_path = os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 349 |
-
with open(summary_path, "w", encoding="utf-8") as f:
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
log("Pipeline finished", "info", "ORCHESTRATOR")
|
| 352 |
except Exception as e:
|
| 353 |
-
state.message
|
|
|
|
| 354 |
log(f"Pipeline error: {e}\n{traceback.format_exc()}", "error", "ORCHESTRATOR")
|
| 355 |
|
| 356 |
@app.route("/")
|
| 357 |
def index():
|
| 358 |
-
return send_from_directory(
|
| 359 |
|
| 360 |
@app.get("/api/system/status")
|
| 361 |
def system_status():
|
| 362 |
return jsonify({
|
| 363 |
-
"gmail": gmail_service is not None,
|
| 364 |
-
"
|
| 365 |
-
"
|
| 366 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
})
|
| 368 |
|
| 369 |
@app.get("/api/groups")
|
|
@@ -372,18 +466,26 @@ def api_groups():
|
|
| 372 |
|
| 373 |
@app.post("/api/process/start")
|
| 374 |
def api_process_start():
|
| 375 |
-
if state.running:
|
|
|
|
| 376 |
data = request.json or {}
|
| 377 |
recips = data.get("recipients") or [SENDER_EMAIL]
|
| 378 |
-
if isinstance(recips, str):
|
|
|
|
| 379 |
threading.Thread(target=run_pipeline, args=(recips,), daemon=True).start()
|
| 380 |
log(f"Start requested by client; recipients={recips}", "info", "API")
|
| 381 |
return jsonify({"success": True, "message": "Pipeline started", "recipients": recips})
|
| 382 |
|
| 383 |
@app.get("/api/process/status")
|
| 384 |
def api_process_status():
|
| 385 |
-
return jsonify({
|
| 386 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
@app.get("/api/process/logs")
|
| 389 |
def api_process_logs():
|
|
@@ -398,25 +500,31 @@ def api_clear_logs():
|
|
| 398 |
|
| 399 |
@app.get("/api/live/state")
|
| 400 |
def api_live_state():
|
| 401 |
-
with live_lock:
|
|
|
|
| 402 |
|
| 403 |
@app.get("/api/results/summary")
|
| 404 |
def api_results_summary():
|
| 405 |
p = state.summary_path or os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 406 |
-
if not os.path.exists(p):
|
| 407 |
-
|
|
|
|
|
|
|
| 408 |
|
| 409 |
@app.get("/api/recipients")
|
| 410 |
def api_get_recipients():
|
| 411 |
recipients_path = "recipients.json"
|
| 412 |
-
if not os.path.exists(recipients_path):
|
|
|
|
| 413 |
try:
|
| 414 |
-
with open(recipients_path, "r", encoding="utf-8") as f:
|
| 415 |
-
|
|
|
|
|
|
|
| 416 |
return jsonify({"success": True, "data": data})
|
| 417 |
except Exception as e:
|
| 418 |
return jsonify({"success": False, "message": f"Error reading file: {str(e)}"}), 500
|
| 419 |
|
| 420 |
if __name__ == "__main__":
|
| 421 |
port = int(os.environ.get("PORT", 7860))
|
| 422 |
-
app.run(host="0.0.0.0", port=port)
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import base64
|
| 6 |
+
import pickle
|
| 7 |
+
import subprocess
|
| 8 |
+
import threading
|
| 9 |
+
import traceback
|
| 10 |
+
import html
|
| 11 |
+
import binascii
|
| 12 |
from datetime import datetime
|
| 13 |
from dataclasses import dataclass, field
|
| 14 |
from typing import List, Dict, Any, Optional
|
|
|
|
| 21 |
|
| 22 |
load_dotenv()
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
class LogBuffer:
|
| 25 |
def __init__(self, max_items: int = 10000):
|
| 26 |
self._buf: List[Dict[str, Any]] = []
|
|
|
|
| 31 |
ts = datetime.now().strftime("%H:%M:%S")
|
| 32 |
line = {"id": self._next_id, "ts": ts, "level": level, "source": source, "msg": msg}
|
| 33 |
with self._lock:
|
| 34 |
+
self._buf.append(line)
|
| 35 |
+
self._next_id += 1
|
| 36 |
+
if len(self._buf) > self._max:
|
| 37 |
+
self._buf = self._buf[-self._max:]
|
| 38 |
def clear(self):
|
| 39 |
+
with self._lock:
|
| 40 |
+
self._buf.clear()
|
| 41 |
def get_after(self, after_id: int, limit: int = 500):
|
| 42 |
with self._lock:
|
| 43 |
+
if after_id <= 0:
|
| 44 |
+
data = self._buf[-limit:]
|
| 45 |
+
else:
|
| 46 |
+
data = [x for x in self._buf if x["id"] > after_id][:limit]
|
| 47 |
last_id = self._buf[-1]["id"] if self._buf else after_id
|
| 48 |
return data, last_id
|
| 49 |
|
| 50 |
logs = LogBuffer()
|
| 51 |
+
|
| 52 |
def log(msg: str, level: str = "info", source: str = "server"):
|
| 53 |
logs.append(msg, level, source)
|
| 54 |
print(f"[{level.upper()}][{source}] {msg}", flush=True)
|
| 55 |
|
| 56 |
def decode_base64_with_padding(b64_string: str) -> bytes:
|
|
|
|
| 57 |
missing_padding = len(b64_string) % 4
|
| 58 |
if missing_padding:
|
| 59 |
+
b64_string += "=" * (4 - missing_padding)
|
| 60 |
try:
|
| 61 |
return base64.b64decode(b64_string)
|
| 62 |
except binascii.Error as e:
|
| 63 |
log(f"Error decoding base64 string: {e}", "error", "SERVER")
|
| 64 |
return b""
|
|
|
|
| 65 |
|
|
|
|
|
|
|
| 66 |
WRITABLE_DIR = "/tmp"
|
| 67 |
COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
|
| 68 |
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
| 69 |
|
| 70 |
+
if "FB_COOKIES_B64" in os.environ:
|
| 71 |
+
decoded_cookies = decode_base64_with_padding(os.environ["FB_COOKIES_B64"])
|
|
|
|
| 72 |
if decoded_cookies:
|
| 73 |
+
with open(COOKIES_PATH, "wb") as f:
|
| 74 |
f.write(decoded_cookies)
|
| 75 |
|
| 76 |
+
if "SERVICE_ACCOUNT_B64" in os.environ:
|
| 77 |
+
decoded_service_account = decode_base64_with_padding(os.environ["SERVICE_ACCOUNT_B64"])
|
| 78 |
if decoded_service_account:
|
| 79 |
+
with open(SERVICE_ACCOUNT_FILE, "w") as f:
|
| 80 |
+
f.write(decoded_service_account.decode("utf-8"))
|
| 81 |
|
| 82 |
+
GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
|
| 83 |
+
FINAL5_PATH = os.environ.get("FINAL5_PATH", "final5.py")
|
| 84 |
+
PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
|
|
|
|
| 85 |
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
|
| 86 |
|
| 87 |
SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
|
|
|
|
| 93 |
if key:
|
| 94 |
GEMINI_KEYS.append(key)
|
| 95 |
|
| 96 |
+
GMAIL_SCOPES = ["https://www.googleapis.com/auth/gmail.send"]
|
| 97 |
os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
|
| 98 |
os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
|
| 99 |
|
|
|
|
|
|
|
| 100 |
def build_gmail_service():
|
| 101 |
if not os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 102 |
log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
|
| 103 |
return None
|
| 104 |
try:
|
| 105 |
creds = service_account.Credentials.from_service_account_file(
|
| 106 |
+
SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
|
| 107 |
+
).with_subject(SENDER_EMAIL)
|
| 108 |
service = build("gmail", "v1", credentials=creds)
|
| 109 |
log("Gmail service built successfully using service account.", "info", "GMAIL")
|
| 110 |
return service
|
|
|
|
| 113 |
log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled for the user {SENDER_EMAIL}", "error", "GMAIL")
|
| 114 |
return None
|
| 115 |
|
|
|
|
| 116 |
gmail_service = build_gmail_service()
|
| 117 |
|
|
|
|
|
|
|
| 118 |
@dataclass
|
| 119 |
class GroupRun:
|
| 120 |
link: str
|
|
|
|
| 137 |
recipients: List[str] = field(default_factory=list)
|
| 138 |
summary_path: str = ""
|
| 139 |
|
| 140 |
+
app = Flask(__name__, static_folder=".", static_url_path="")
|
| 141 |
CORS(app)
|
| 142 |
|
| 143 |
live_lock = threading.Lock()
|
|
|
|
| 174 |
live_state["counts"]["total_posts"] = len(posts)
|
| 175 |
|
| 176 |
def handle_event_line(line: str):
|
| 177 |
+
if not line.startswith("::"):
|
| 178 |
+
return
|
| 179 |
try:
|
| 180 |
if "::SCRAPE_SAVED::" in line:
|
| 181 |
path = line.split("::SCRAPE_SAVED::", 1)[1].strip()
|
| 182 |
+
if path:
|
| 183 |
+
load_scraped_into_live(path)
|
| 184 |
elif "::KW_HIT::" in line:
|
| 185 |
d = json.loads(line.split("::KW_HIT::", 1)[1].strip())
|
| 186 |
p = ensure_post_obj(int(d["id"]))
|
| 187 |
p["found_keywords"] = d.get("found_keywords", [])
|
| 188 |
+
with live_lock:
|
| 189 |
+
live_state["counts"]["kw_hits"] += 1
|
| 190 |
elif "::AI_RESULT::" in line:
|
| 191 |
d = json.loads(line.split("::AI_RESULT::", 1)[1].strip())
|
| 192 |
p = ensure_post_obj(int(d["id"]))
|
|
|
|
| 194 |
p["ai"] = ai
|
| 195 |
with live_lock:
|
| 196 |
live_state["counts"]["ai_done"] += 1
|
| 197 |
+
if ai.get("is_medical_seeking"):
|
| 198 |
+
live_state["counts"]["confirmed"] += 1
|
| 199 |
elif "::EMAIL_SENT::" in line:
|
| 200 |
d = json.loads(line.split("::EMAIL_SENT::", 1)[1].strip())
|
| 201 |
p = ensure_post_obj(int(d["id"]))
|
| 202 |
sent = int(d.get("sent", 0))
|
| 203 |
p["email_sent"] = sent > 0
|
| 204 |
if sent > 0:
|
| 205 |
+
with live_lock:
|
| 206 |
+
live_state["counts"]["emails"] += sent
|
| 207 |
except Exception as e:
|
| 208 |
log(f"live parse error: {e}", "error", "LIVE")
|
| 209 |
|
| 210 |
def read_groups(path: str) -> List[str]:
|
| 211 |
+
if not os.path.exists(path):
|
| 212 |
+
return []
|
| 213 |
with open(path, "r", encoding="utf-8") as f:
|
| 214 |
return [ln.strip() for ln in f.read().splitlines() if ln.strip()]
|
| 215 |
|
|
|
|
| 240 |
log(f"Gmail send error to {to}: {e}", "error", "gmail")
|
| 241 |
return sent
|
| 242 |
|
| 243 |
+
def build_confirmed_posts_email(groups_run: List["GroupRun"], all_confirmed_posts: List[Dict[str, Any]]) -> str:
|
| 244 |
+
total_groups = len(groups_run)
|
| 245 |
+
total_scraped = sum(g.scraped_posts for g in groups_run)
|
| 246 |
+
total_confirmed = len(all_confirmed_posts)
|
| 247 |
+
rows = []
|
| 248 |
+
for g in groups_run:
|
| 249 |
+
rows.append(f"""
|
| 250 |
<tr>
|
| 251 |
<td style="padding: 8px; border-bottom: 1px solid #eee;"><a href="{g.link}" target="_blank">{g.link}</a></td>
|
| 252 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.scraped_posts}</td>
|
| 253 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.detected_posts}</td>
|
| 254 |
<td style="padding: 8px; border-bottom: 1px solid #eee;">{"OK" if g.stage == "done" else "ERROR"}</td>
|
| 255 |
+
</tr>""")
|
| 256 |
+
summary_table_html = f"""<h3>Group Summary</h3>
|
| 257 |
+
<table style="width: 100%; border-collapse: collapse; margin-top: 8px; border: 1px solid #ddd;">
|
| 258 |
+
<thead>
|
| 259 |
+
<tr style="background: #0f172a; color: #fff;">
|
| 260 |
+
<th style="text-align: left; padding: 8px;">Group Link</th>
|
| 261 |
+
<th style="text-align: center; padding: 8px;">Posts Scraped</th>
|
| 262 |
+
<th style="text-align: center; padding: 8px;">Confirmed Posts</th>
|
| 263 |
+
<th style="text-align: left; padding: 8px;">Status</th>
|
| 264 |
+
</tr>
|
| 265 |
+
</thead>
|
| 266 |
+
<tbody>
|
| 267 |
+
{''.join(rows)}
|
| 268 |
+
</tbody>
|
| 269 |
+
</table>"""
|
| 270 |
if all_confirmed_posts:
|
| 271 |
+
posts_html = "".join(
|
| 272 |
+
f"""
|
| 273 |
+
<div style="margin-bottom: 25px; padding: 12px; border: 1px solid #ddd; border-radius: 5px; background-color: #fafafa;">
|
| 274 |
+
<h4 style="margin-top: 0; margin-bottom: 8px;">Post ID: {p.get("id", "N/A")} | Urgency: {p.get("ai_analysis", {}).get("urgency_level", "N/A")} | Confidence: {p.get("ai_analysis", {}).get("confidence", "N/A")}</h4>
|
| 275 |
+
<p style="margin: 5px 0;"><strong>Summary:</strong> {html.escape(p.get("ai_analysis", {}).get("medical_summary", "N/A"))}</p>
|
| 276 |
+
<p style="margin: 5px 0;"><strong>Text:</strong></p>
|
| 277 |
+
<pre style="white-space: pre-wrap; background-color: #f0f0f0; padding: 8px; border: 1px solid #eee; border-radius: 3px; font-family: monospace; font-size: 0.9em;">{html.escape(p.get("text", "N/A"))}</pre>
|
| 278 |
+
<p style="margin: 5px 0;"><a href="{p.get("group_link", "#")}" target="_blank">View Group</a></p>
|
| 279 |
+
</div>"""
|
| 280 |
+
for p in all_confirmed_posts
|
| 281 |
+
)
|
| 282 |
+
else:
|
| 283 |
+
posts_html = "<p>No confirmed medical posts were found during this run.</p>"
|
| 284 |
+
return f"""<!DOCTYPE html>
|
| 285 |
+
<html>
|
| 286 |
+
<head><title>Hillside Medical Group - Confirmed Medical Posts Summary</title></head>
|
| 287 |
+
<body style="font-family: Arial, sans-serif; margin: 0; padding: 0; background-color: #f5f5f5;">
|
| 288 |
+
<div style="max-width: 900px; margin: 20px auto; padding: 20px; background-color: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px;">
|
| 289 |
+
<div style="background: #1e3c72; color: #fff; padding: 16px 20px; border-radius: 6px 6px 0 0;">
|
| 290 |
+
<h2 style="margin: 0;">Hillside Medical Group - Confirmed Medical Posts</h2>
|
| 291 |
+
<div style="font-size: 0.9em;">Run completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
|
| 292 |
+
</div>
|
| 293 |
+
<div style="padding: 16px;">
|
| 294 |
+
<p><strong>Overall Summary:</strong> Processed {total_groups} groups, scraped {total_scraped} posts, found {total_confirmed} confirmed medical posts.</p>
|
| 295 |
+
<hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">
|
| 296 |
+
{summary_table_html}
|
| 297 |
+
<hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">
|
| 298 |
+
<h3>Confirmed Posts Details</h3>
|
| 299 |
+
{posts_html}
|
| 300 |
+
</div>
|
| 301 |
+
<div style="margin-top: 20px; padding: 10px; font-size: 0.8em; color: #666; border-top: 1px solid #eee;">
|
| 302 |
+
<p>This email contains posts identified as potentially seeking personal medical help. Please review and take appropriate action.</p>
|
| 303 |
+
<p><em>Note: The link provided is to the group. Direct post links are not currently extracted.</em></p>
|
| 304 |
+
</div>
|
| 305 |
+
</div>
|
| 306 |
+
</body>
|
| 307 |
+
</html>"""
|
| 308 |
|
| 309 |
state = PipelineState()
|
| 310 |
|
| 311 |
def stream_process_lines(args: List[str], env: Optional[Dict[str, str]] = None, tag: str = "FINAL5") -> int:
|
| 312 |
log(f"Exec: {' '.join(args)}", "info", tag)
|
| 313 |
+
proc = subprocess.Popen(
|
| 314 |
+
args,
|
| 315 |
+
stdout=subprocess.PIPE,
|
| 316 |
+
stderr=subprocess.PIPE,
|
| 317 |
+
text=True,
|
| 318 |
+
bufsize=1,
|
| 319 |
+
universal_newlines=True,
|
| 320 |
+
env=env or os.environ.copy()
|
| 321 |
+
)
|
| 322 |
def pump(pipe, name):
|
| 323 |
for raw in pipe:
|
| 324 |
line = (raw or "").rstrip("\n")
|
| 325 |
+
if not line:
|
| 326 |
+
continue
|
| 327 |
if line.startswith("::"):
|
| 328 |
+
try:
|
| 329 |
+
handle_event_line(line)
|
| 330 |
+
except Exception as e:
|
| 331 |
+
log(f"event parse error: {e}", "error", tag)
|
| 332 |
log(line, "info" if name == "stdout" else "warn", tag)
|
| 333 |
t1 = threading.Thread(target=pump, args=(proc.stdout, "stdout"), daemon=True)
|
| 334 |
t2 = threading.Thread(target=pump, args=(proc.stderr, "stderr"), daemon=True)
|
| 335 |
+
t1.start()
|
| 336 |
+
t2.start()
|
| 337 |
rc = proc.wait()
|
| 338 |
+
t1.join(timeout=0.2)
|
| 339 |
+
t2.join(timeout=0.2)
|
| 340 |
log(f"Exit code: {rc}", "info", tag)
|
| 341 |
return rc
|
| 342 |
|
|
|
|
| 351 |
"--cookies-file", COOKIES_PATH,
|
| 352 |
"--headless"
|
| 353 |
]
|
| 354 |
+
if GEMINI_KEYS:
|
| 355 |
+
args.extend(["--gemini-keys", ",".join(GEMINI_KEYS)])
|
| 356 |
env = os.environ.copy()
|
| 357 |
env["PYTHONUNBUFFERED"] = "1"
|
| 358 |
env["PYTHONIOENCODING"] = "utf-8"
|
| 359 |
+
env.setdefault("HOME", WRITABLE_DIR)
|
| 360 |
+
env.setdefault("WDM_LOCAL", "1")
|
| 361 |
+
env.setdefault("WDM_CACHE_DIR", os.path.join(WRITABLE_DIR, ".wdm"))
|
| 362 |
+
env.setdefault("SE_MANAGER_DRIVER_CACHE", os.path.join(WRITABLE_DIR, "selenium"))
|
| 363 |
+
os.makedirs(env["WDM_CACHE_DIR"], exist_ok=True)
|
| 364 |
+
os.makedirs(env["SE_MANAGER_DRIVER_CACHE"], exist_ok=True)
|
| 365 |
rc = stream_process_lines(args, env=env, tag="FINAL5")
|
| 366 |
return {"ok": rc == 0, "code": rc}
|
| 367 |
|
|
|
|
| 369 |
try:
|
| 370 |
logs.clear()
|
| 371 |
log("Pipeline starting", "info", "ORCHESTRATOR")
|
| 372 |
+
state.running = True
|
| 373 |
+
state.message = "initializing"
|
| 374 |
+
state.progress = 0
|
| 375 |
+
state.recipients = recipients
|
| 376 |
state.groups.clear()
|
| 377 |
links = read_groups(GROUPS_TXT)
|
| 378 |
state.total = len(links)
|
| 379 |
if not links:
|
| 380 |
log("No groups found in groups.txt", "warn", "ORCHESTRATOR")
|
| 381 |
+
state.message = "No groups"
|
| 382 |
+
state.running = False
|
| 383 |
return
|
| 384 |
all_confirmed_posts = []
|
| 385 |
for i, link in enumerate(links, start=1):
|
| 386 |
reset_live_state(link)
|
| 387 |
g = GroupRun(link=link, stage="running")
|
| 388 |
state.groups.append(g)
|
| 389 |
+
state.current = i
|
| 390 |
+
state.message = f"Processing {link}"
|
| 391 |
+
state.progress = int(((i - 1) / max(1, state.total)) * 100)
|
| 392 |
log(f"[{i}/{state.total}] Processing group: {link}", "info", "ORCHESTRATOR")
|
| 393 |
slug = slugify(link)
|
| 394 |
+
out_json = os.path.join(SCRAPE_OUTDIR, f"{slug}.json")
|
| 395 |
+
analysis_json = os.path.join(ANALYSIS_OUTDIR, f"analysis_{slug}.json")
|
| 396 |
+
g.scraped_json = out_json
|
| 397 |
+
g.analysis_json = analysis_json
|
| 398 |
result = call_final5_for_group(link, out_json, analysis_json, recipients)
|
| 399 |
if not result.get("ok"):
|
| 400 |
+
g.stage = "error"
|
| 401 |
+
g.error = f"final5 exit code {result.get('code')}"
|
| 402 |
log(f"final5 failed for {link}: code {result.get('code')}", "error", "ORCHESTRATOR")
|
| 403 |
else:
|
| 404 |
try:
|
| 405 |
if os.path.exists(out_json):
|
| 406 |
+
with open(out_json, "r", encoding="utf-8") as f:
|
| 407 |
+
g.scraped_posts = len(json.load(f))
|
| 408 |
if os.path.exists(analysis_json):
|
| 409 |
+
with open(analysis_json, "r", encoding="utf-8") as f:
|
| 410 |
+
a = json.load(f)
|
| 411 |
g.detected_posts = a.get("confirmed_medical", 0)
|
| 412 |
g.emails_sent_by_final5 = a.get("emails_sent", 0)
|
| 413 |
confirmed_posts = a.get("posts", [])
|
| 414 |
for post in confirmed_posts:
|
| 415 |
+
if "group_link" not in post:
|
| 416 |
+
post["group_link"] = link
|
| 417 |
all_confirmed_posts.extend(confirmed_posts)
|
| 418 |
g.stage = "done"
|
| 419 |
log(f"Group done: scraped={g.scraped_posts}, confirmed={g.detected_posts}", "info", "ORCHESTRATOR")
|
| 420 |
except Exception as e:
|
| 421 |
+
g.stage = "error"
|
| 422 |
+
g.error = f"parse_error: {e}"
|
| 423 |
log(f"Parsing outputs failed for {link}: {e}", "error", "ORCHESTRATOR")
|
| 424 |
state.progress = int((i / max(1, state.total)) * 100)
|
| 425 |
try:
|
|
|
|
| 431 |
log(f"Error building or sending consolidated email: {e}", "error", "ORCHESTRATOR")
|
| 432 |
summary = {"run_date": datetime.now().isoformat(), "groups": [g.__dict__ for g in state.groups]}
|
| 433 |
summary_path = os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 434 |
+
with open(summary_path, "w", encoding="utf-8") as f:
|
| 435 |
+
json.dump(summary, f, ensure_ascii=False, indent=2)
|
| 436 |
+
state.summary_path = summary_path
|
| 437 |
+
state.message = "All groups processed"
|
| 438 |
+
state.progress = 100
|
| 439 |
+
state.running = False
|
| 440 |
log("Pipeline finished", "info", "ORCHESTRATOR")
|
| 441 |
except Exception as e:
|
| 442 |
+
state.message = f"pipeline_error: {e}"
|
| 443 |
+
state.running = False
|
| 444 |
log(f"Pipeline error: {e}\n{traceback.format_exc()}", "error", "ORCHESTRATOR")
|
| 445 |
|
| 446 |
@app.route("/")
|
| 447 |
def index():
|
| 448 |
+
return send_from_directory(".", "index.html")
|
| 449 |
|
| 450 |
@app.get("/api/system/status")
|
| 451 |
def system_status():
|
| 452 |
return jsonify({
|
| 453 |
+
"gmail": gmail_service is not None,
|
| 454 |
+
"groups_file_exists": os.path.exists(GROUPS_TXT),
|
| 455 |
+
"groups_count": len(read_groups(GROUPS_TXT)),
|
| 456 |
+
"scrape_outdir": SCRAPE_OUTDIR,
|
| 457 |
+
"analysis_outdir": ANALYSIS_OUTDIR,
|
| 458 |
+
"sender_email": SENDER_EMAIL,
|
| 459 |
+
"final5_exists": os.path.exists(FINAL5_PATH),
|
| 460 |
+
"gemini_keys_count": len(GEMINI_KEYS)
|
| 461 |
})
|
| 462 |
|
| 463 |
@app.get("/api/groups")
|
|
|
|
| 466 |
|
| 467 |
@app.post("/api/process/start")
|
| 468 |
def api_process_start():
|
| 469 |
+
if state.running:
|
| 470 |
+
return jsonify({"success": False, "message": "Already running"}), 409
|
| 471 |
data = request.json or {}
|
| 472 |
recips = data.get("recipients") or [SENDER_EMAIL]
|
| 473 |
+
if isinstance(recips, str):
|
| 474 |
+
recips = [e.strip() for e in recips.split(",") if e.strip()]
|
| 475 |
threading.Thread(target=run_pipeline, args=(recips,), daemon=True).start()
|
| 476 |
log(f"Start requested by client; recipients={recips}", "info", "API")
|
| 477 |
return jsonify({"success": True, "message": "Pipeline started", "recipients": recips})
|
| 478 |
|
| 479 |
@app.get("/api/process/status")
|
| 480 |
def api_process_status():
|
| 481 |
+
return jsonify({
|
| 482 |
+
"running": state.running,
|
| 483 |
+
"message": state.message,
|
| 484 |
+
"progress": state.progress,
|
| 485 |
+
"current": state.current,
|
| 486 |
+
"total": state.total,
|
| 487 |
+
"groups": [g.__dict__ for g in state.groups]
|
| 488 |
+
})
|
| 489 |
|
| 490 |
@app.get("/api/process/logs")
|
| 491 |
def api_process_logs():
|
|
|
|
| 500 |
|
| 501 |
@app.get("/api/live/state")
|
| 502 |
def api_live_state():
|
| 503 |
+
with live_lock:
|
| 504 |
+
return jsonify({"success": True, "data": live_state})
|
| 505 |
|
| 506 |
@app.get("/api/results/summary")
|
| 507 |
def api_results_summary():
|
| 508 |
p = state.summary_path or os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 509 |
+
if not os.path.exists(p):
|
| 510 |
+
return jsonify({"success": False, "message": "No summary yet"}), 404
|
| 511 |
+
with open(p, "r", encoding="utf-8") as f:
|
| 512 |
+
return jsonify({"success": True, "data": json.load(f)})
|
| 513 |
|
| 514 |
@app.get("/api/recipients")
|
| 515 |
def api_get_recipients():
|
| 516 |
recipients_path = "recipients.json"
|
| 517 |
+
if not os.path.exists(recipients_path):
|
| 518 |
+
return jsonify({"success": False, "message": "recipients.json not found"}), 404
|
| 519 |
try:
|
| 520 |
+
with open(recipients_path, "r", encoding="utf-8") as f:
|
| 521 |
+
data = json.load(f)
|
| 522 |
+
if not isinstance(data, list):
|
| 523 |
+
return jsonify({"success": False, "message": "Invalid format"}), 500
|
| 524 |
return jsonify({"success": True, "data": data})
|
| 525 |
except Exception as e:
|
| 526 |
return jsonify({"success": False, "message": f"Error reading file: {str(e)}"}), 500
|
| 527 |
|
| 528 |
if __name__ == "__main__":
|
| 529 |
port = int(os.environ.get("PORT", 7860))
|
| 530 |
+
app.run(host="0.0.0.0", port=port)
|
final5.py
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from typing import List, Dict, Any, Tuple
|
| 4 |
from datetime import datetime
|
| 5 |
import tempfile
|
|
@@ -11,16 +19,16 @@ except Exception:
|
|
| 11 |
pass
|
| 12 |
|
| 13 |
from selenium import webdriver
|
| 14 |
-
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 15 |
from selenium.webdriver.common.by import By
|
|
|
|
| 16 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 17 |
from selenium.webdriver.support import expected_conditions as EC
|
| 18 |
-
from selenium.common.exceptions import
|
| 19 |
-
|
| 20 |
-
)
|
| 21 |
from google.oauth2 import service_account
|
| 22 |
from googleapiclient.discovery import build
|
| 23 |
from googleapiclient.errors import HttpError
|
|
|
|
| 24 |
import google.generativeai as genai
|
| 25 |
from google.api_core.exceptions import ResourceExhausted
|
| 26 |
|
|
@@ -35,20 +43,18 @@ def get_args():
|
|
| 35 |
p.add_argument("--recipients", default="")
|
| 36 |
p.add_argument("--sender", default=os.environ.get("SENDER_EMAIL", ""))
|
| 37 |
p.add_argument("--cookies-file", default=os.path.join(WRITABLE_DIR, "facebook_cookies.pkl"))
|
| 38 |
-
p.add_argument("--max-scrolls", type=int, default=int(os.environ.get("MAX_SCROLLS","5")))
|
| 39 |
-
p.add_argument("--scroll-pause", type=float, default=float(os.environ.get("SCROLL_PAUSE","3")))
|
| 40 |
p.add_argument("--gemini-keys", default="")
|
| 41 |
-
p.add_argument("--headless", action="store_true"
|
| 42 |
return p.parse_args()
|
| 43 |
|
| 44 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 45 |
options = webdriver.ChromeOptions()
|
| 46 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
| 47 |
-
|
| 48 |
-
options.binary_location = "/usr/bin/google-chrome"
|
| 49 |
-
|
| 50 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 51 |
-
|
|
|
|
| 52 |
options.add_argument("--no-sandbox")
|
| 53 |
options.add_argument("--disable-dev-shm-usage")
|
| 54 |
options.add_argument("--disable-gpu")
|
|
@@ -57,27 +63,34 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
|
| 57 |
options.add_argument("--disable-extensions")
|
| 58 |
options.add_argument("--remote-debugging-port=9222")
|
| 59 |
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
driver = webdriver.Chrome(service=service, options=options)
|
| 64 |
-
|
| 65 |
-
print("[SELENIUM] WebDriver session created successfully using system binaries.")
|
| 66 |
return driver, user_data_dir
|
| 67 |
|
| 68 |
def build_gmail_service():
|
| 69 |
if os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 70 |
try:
|
| 71 |
sender_email = os.environ.get("SENDER_EMAIL")
|
| 72 |
-
if not sender_email:
|
|
|
|
| 73 |
credentials = service_account.Credentials.from_service_account_file(
|
| 74 |
-
SERVICE_ACCOUNT_FILE,
|
|
|
|
|
|
|
| 75 |
return build("gmail", "v1", credentials=credentials)
|
| 76 |
except Exception as e:
|
| 77 |
print(f"[GMAIL] Auth failed in final5.py: {e}")
|
| 78 |
return None
|
| 79 |
|
| 80 |
GEMINI_MODEL = "gemini-1.5-flash"
|
|
|
|
| 81 |
class GeminiManager:
|
| 82 |
def __init__(self, api_keys: List[str]):
|
| 83 |
self.api_keys = api_keys
|
|
@@ -122,9 +135,19 @@ class GeminiManager:
|
|
| 122 |
else:
|
| 123 |
raise e
|
| 124 |
|
| 125 |
-
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
|
| 126 |
-
fallback = {
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 129 |
prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
|
| 130 |
KEYWORDS: {keywords_str}
|
|
@@ -135,19 +158,25 @@ RULES:
|
|
| 135 |
Post: "{post_text}"
|
| 136 |
Return ONLY JSON:
|
| 137 |
{{
|
| 138 |
-
"is_medical_seeking": true/false,
|
| 139 |
-
"
|
| 140 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
}}"""
|
| 142 |
for _ in range(2):
|
| 143 |
try:
|
| 144 |
resp = gemini_manager.generate_content(prompt)
|
| 145 |
-
txt = (resp
|
| 146 |
s, e = txt.find("{"), txt.rfind("}") + 1
|
| 147 |
if s >= 0 and e > s:
|
| 148 |
result = json.loads(txt[s:e])
|
| 149 |
result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
|
| 150 |
-
if "matched_keywords" not in result:
|
|
|
|
| 151 |
return result
|
| 152 |
return fallback
|
| 153 |
except Exception as e:
|
|
@@ -155,7 +184,14 @@ Return ONLY JSON:
|
|
| 155 |
gemini_manager.rotate_key()
|
| 156 |
return fallback
|
| 157 |
|
| 158 |
-
MEDICAL_KEYWORDS = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
| 161 |
tl = (text or "").lower()
|
|
@@ -166,22 +202,20 @@ def load_cookies(driver, cookies_file: str):
|
|
| 166 |
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 167 |
driver.get("https://www.facebook.com")
|
| 168 |
time.sleep(2)
|
| 169 |
-
|
| 170 |
if not os.path.exists(cookies_file):
|
| 171 |
raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
|
| 172 |
-
|
| 173 |
with open(cookies_file, "rb") as f:
|
| 174 |
cookies = pickle.load(f)
|
| 175 |
-
|
| 176 |
for cookie in cookies:
|
| 177 |
-
if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
|
| 178 |
cookie["sameSite"] = "Lax"
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
| 181 |
print("[FB] All cookies loaded. Refreshing page to apply session...")
|
| 182 |
driver.refresh()
|
| 183 |
time.sleep(5)
|
| 184 |
-
|
| 185 |
if "log in" in driver.title.lower():
|
| 186 |
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 187 |
else:
|
|
@@ -204,14 +238,15 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
|
| 204 |
print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
|
| 205 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 206 |
time.sleep(pause)
|
| 207 |
-
|
| 208 |
divs = driver.find_elements(By.XPATH, "//div[@role='article']")
|
| 209 |
added_this_scroll = 0
|
| 210 |
for d in divs:
|
| 211 |
try:
|
| 212 |
txt = (d.text or "").strip()
|
| 213 |
-
if len(txt) < 25 or txt in seen:
|
| 214 |
-
|
|
|
|
|
|
|
| 215 |
seen.add(txt)
|
| 216 |
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 217 |
added_this_scroll += 1
|
|
@@ -227,7 +262,7 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
|
|
| 227 |
posts = []
|
| 228 |
try:
|
| 229 |
driver, user_data_dir = new_driver(headless=True)
|
| 230 |
-
wait = WebDriverWait(driver,
|
| 231 |
load_cookies(driver, cookies_file)
|
| 232 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 233 |
except Exception as e:
|
|
@@ -235,8 +270,10 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
|
|
| 235 |
raise
|
| 236 |
finally:
|
| 237 |
if driver:
|
| 238 |
-
try:
|
| 239 |
-
|
|
|
|
|
|
|
| 240 |
if user_data_dir and os.path.exists(user_data_dir):
|
| 241 |
try:
|
| 242 |
shutil.rmtree(user_data_dir, ignore_errors=True)
|
|
@@ -249,42 +286,39 @@ def main():
|
|
| 249 |
args = get_args()
|
| 250 |
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
|
| 251 |
os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
|
| 252 |
-
|
| 253 |
gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
|
| 254 |
gemini_manager = GeminiManager(gemini_keys)
|
| 255 |
-
|
| 256 |
-
posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 257 |
-
|
| 258 |
with open(args.out, "w", encoding="utf-8") as f:
|
| 259 |
json.dump(posts, f, ensure_ascii=False, indent=2)
|
| 260 |
print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
|
| 261 |
print(f"::SCRAPE_SAVED::{args.out}")
|
| 262 |
-
|
| 263 |
keyword_hits, confirmed = [], []
|
| 264 |
for p in posts:
|
| 265 |
-
has, hits = contains_keywords(p.get("text",""))
|
| 266 |
if has:
|
| 267 |
p["found_keywords"] = hits
|
| 268 |
keyword_hits.append(p)
|
| 269 |
print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
|
| 270 |
-
|
| 271 |
per_call_sleep = 5
|
| 272 |
for idx, p in enumerate(keyword_hits, start=1):
|
| 273 |
found_kws = p.get("found_keywords", [])
|
| 274 |
-
ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
|
| 275 |
p["ai_analysis"] = ai
|
| 276 |
print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
|
| 277 |
if ai.get("is_medical_seeking"):
|
| 278 |
confirmed.append(p)
|
| 279 |
if idx < len(keyword_hits):
|
| 280 |
time.sleep(per_call_sleep)
|
| 281 |
-
|
| 282 |
report = {
|
| 283 |
-
"analysis_date": datetime.now().isoformat(),
|
| 284 |
-
"
|
| 285 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
}
|
| 287 |
-
|
| 288 |
with open(args.analysis_out, "w", encoding="utf-8") as f:
|
| 289 |
json.dump(report, f, ensure_ascii=False, indent=2)
|
| 290 |
print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import sys
|
| 4 |
+
import time
|
| 5 |
+
import json
|
| 6 |
+
import base64
|
| 7 |
+
import pickle
|
| 8 |
+
import argparse
|
| 9 |
+
import traceback
|
| 10 |
+
import shutil
|
| 11 |
from typing import List, Dict, Any, Tuple
|
| 12 |
from datetime import datetime
|
| 13 |
import tempfile
|
|
|
|
| 19 |
pass
|
| 20 |
|
| 21 |
from selenium import webdriver
|
|
|
|
| 22 |
from selenium.webdriver.common.by import By
|
| 23 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 24 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 25 |
from selenium.webdriver.support import expected_conditions as EC
|
| 26 |
+
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, TimeoutException
|
| 27 |
+
|
|
|
|
| 28 |
from google.oauth2 import service_account
|
| 29 |
from googleapiclient.discovery import build
|
| 30 |
from googleapiclient.errors import HttpError
|
| 31 |
+
|
| 32 |
import google.generativeai as genai
|
| 33 |
from google.api_core.exceptions import ResourceExhausted
|
| 34 |
|
|
|
|
| 43 |
p.add_argument("--recipients", default="")
|
| 44 |
p.add_argument("--sender", default=os.environ.get("SENDER_EMAIL", ""))
|
| 45 |
p.add_argument("--cookies-file", default=os.path.join(WRITABLE_DIR, "facebook_cookies.pkl"))
|
| 46 |
+
p.add_argument("--max-scrolls", type=int, default=int(os.environ.get("MAX_SCROLLS", "5")))
|
| 47 |
+
p.add_argument("--scroll-pause", type=float, default=float(os.environ.get("SCROLL_PAUSE", "3")))
|
| 48 |
p.add_argument("--gemini-keys", default="")
|
| 49 |
+
p.add_argument("--headless", action="store_true")
|
| 50 |
return p.parse_args()
|
| 51 |
|
| 52 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 53 |
options = webdriver.ChromeOptions()
|
| 54 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
|
|
|
|
|
|
|
|
|
| 55 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 56 |
+
if headless:
|
| 57 |
+
options.add_argument("--headless=new")
|
| 58 |
options.add_argument("--no-sandbox")
|
| 59 |
options.add_argument("--disable-dev-shm-usage")
|
| 60 |
options.add_argument("--disable-gpu")
|
|
|
|
| 63 |
options.add_argument("--disable-extensions")
|
| 64 |
options.add_argument("--remote-debugging-port=9222")
|
| 65 |
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 66 |
+
os.environ.setdefault("HOME", WRITABLE_DIR)
|
| 67 |
+
os.environ.setdefault("WDM_LOCAL", "1")
|
| 68 |
+
os.environ.setdefault("WDM_CACHE_DIR", os.path.join(WRITABLE_DIR, ".wdm"))
|
| 69 |
+
os.environ.setdefault("SE_MANAGER_DRIVER_CACHE", os.path.join(WRITABLE_DIR, "selenium"))
|
| 70 |
+
os.makedirs(os.environ["WDM_CACHE_DIR"], exist_ok=True)
|
| 71 |
+
os.makedirs(os.environ["SE_MANAGER_DRIVER_CACHE"], exist_ok=True)
|
| 72 |
+
service = ChromeService()
|
| 73 |
driver = webdriver.Chrome(service=service, options=options)
|
| 74 |
+
print("[SELENIUM] WebDriver session created successfully.")
|
|
|
|
| 75 |
return driver, user_data_dir
|
| 76 |
|
| 77 |
def build_gmail_service():
    """Build a Gmail API client using a domain-delegated service account.

    Returns None when the service-account file is missing, SENDER_EMAIL is
    unset, or authentication fails (the error is printed, never raised).
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None
    try:
        sender_email = os.environ.get("SENDER_EMAIL")
        if not sender_email:
            # with_subject() needs a mailbox to impersonate; bail out early.
            return None
        creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE,
            scopes=["https://www.googleapis.com/auth/gmail.send"],
        )
        delegated = creds.with_subject(sender_email)
        return build("gmail", "v1", credentials=delegated)
    except Exception as e:
        print(f"[GMAIL] Auth failed in final5.py: {e}")
    return None
|
| 91 |
|
| 92 |
GEMINI_MODEL = "gemini-1.5-flash"
|
| 93 |
+
|
| 94 |
class GeminiManager:
|
| 95 |
def __init__(self, api_keys: List[str]):
|
| 96 |
self.api_keys = api_keys
|
|
|
|
| 135 |
else:
|
| 136 |
raise e
|
| 137 |
|
| 138 |
+
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str, Any]:
|
| 139 |
+
fallback = {
|
| 140 |
+
"is_medical_seeking": False,
|
| 141 |
+
"confidence": "low",
|
| 142 |
+
"medical_summary": "AI unavailable",
|
| 143 |
+
"suggested_services": [],
|
| 144 |
+
"urgency_level": "low",
|
| 145 |
+
"analysis": "Fallback",
|
| 146 |
+
"reasoning": "AI error",
|
| 147 |
+
"matched_keywords": found_keywords
|
| 148 |
+
}
|
| 149 |
+
if not gemini_manager or not gemini_manager.is_available():
|
| 150 |
+
return fallback
|
| 151 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 152 |
prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
|
| 153 |
KEYWORDS: {keywords_str}
|
|
|
|
| 158 |
Post: "{post_text}"
|
| 159 |
Return ONLY JSON:
|
| 160 |
{{
|
| 161 |
+
"is_medical_seeking": true/false,
|
| 162 |
+
"confidence": "high/medium/low",
|
| 163 |
+
"medical_summary": "short summary",
|
| 164 |
+
"suggested_services": ["service1","service2"],
|
| 165 |
+
"urgency_level": "high/medium/low",
|
| 166 |
+
"analysis": "why it's seeking help",
|
| 167 |
+
"reasoning": "short explanation",
|
| 168 |
+
"matched_keywords": ["keyword1"]
|
| 169 |
}}"""
|
| 170 |
for _ in range(2):
|
| 171 |
try:
|
| 172 |
resp = gemini_manager.generate_content(prompt)
|
| 173 |
+
txt = (getattr(resp, "text", "") or "").strip()
|
| 174 |
s, e = txt.find("{"), txt.rfind("}") + 1
|
| 175 |
if s >= 0 and e > s:
|
| 176 |
result = json.loads(txt[s:e])
|
| 177 |
result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
|
| 178 |
+
if "matched_keywords" not in result:
|
| 179 |
+
result["matched_keywords"] = found_keywords
|
| 180 |
return result
|
| 181 |
return fallback
|
| 182 |
except Exception as e:
|
|
|
|
| 184 |
gemini_manager.rotate_key()
|
| 185 |
return fallback
|
| 186 |
|
| 187 |
+
# Lower-cased trigger phrases for the keyword pre-filter; presumably matched
# as substrings of the lower-cased post text by contains_keywords() — verify
# there, its body is defined below.
MEDICAL_KEYWORDS = [
    "doctor", "physician", "primary care", "healthcare", "medical", "clinic",
    "hospital", "urgent care", "emergency", "er", "specialist", "pediatrician",
    "dentist", "gynecologist", "obgyn", "women's health", "health center",
    "family doctor", "maternity", "prenatal", "postnatal", "labor", "delivery",
    "need doctor", "looking for doctor", "find doctor", "recommend doctor",
    "medical help", "health help", "appointment", "checkup", "treatment",
    "prescription", "medicine", "surgery", "best hospital", "best clinic",
    "where to go", "doctor recommendation", "pregnancy", "birth control",
    "contraception", "fertility", "hillside", "medical group", "wellness center",
]
|
| 195 |
|
| 196 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
| 197 |
tl = (text or "").lower()
|
|
|
|
| 202 |
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 203 |
driver.get("https://www.facebook.com")
|
| 204 |
time.sleep(2)
|
|
|
|
| 205 |
if not os.path.exists(cookies_file):
|
| 206 |
raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
|
|
|
|
| 207 |
with open(cookies_file, "rb") as f:
|
| 208 |
cookies = pickle.load(f)
|
|
|
|
| 209 |
for cookie in cookies:
|
| 210 |
+
if "sameSite" in cookie and cookie["sameSite"] not in ["Strict", "Lax", "None"]:
|
| 211 |
cookie["sameSite"] = "Lax"
|
| 212 |
+
try:
|
| 213 |
+
driver.add_cookie(cookie)
|
| 214 |
+
except Exception:
|
| 215 |
+
pass
|
| 216 |
print("[FB] All cookies loaded. Refreshing page to apply session...")
|
| 217 |
driver.refresh()
|
| 218 |
time.sleep(5)
|
|
|
|
| 219 |
if "log in" in driver.title.lower():
|
| 220 |
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 221 |
else:
|
|
|
|
| 238 |
print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
|
| 239 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 240 |
time.sleep(pause)
|
|
|
|
| 241 |
divs = driver.find_elements(By.XPATH, "//div[@role='article']")
|
| 242 |
added_this_scroll = 0
|
| 243 |
for d in divs:
|
| 244 |
try:
|
| 245 |
txt = (d.text or "").strip()
|
| 246 |
+
if len(txt) < 25 or txt in seen:
|
| 247 |
+
continue
|
| 248 |
+
if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]):
|
| 249 |
+
continue
|
| 250 |
seen.add(txt)
|
| 251 |
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 252 |
added_this_scroll += 1
|
|
|
|
| 262 |
posts = []
|
| 263 |
try:
|
| 264 |
driver, user_data_dir = new_driver(headless=True)
|
| 265 |
+
wait = WebDriverWait(driver, 30)
|
| 266 |
load_cookies(driver, cookies_file)
|
| 267 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 268 |
except Exception as e:
|
|
|
|
| 270 |
raise
|
| 271 |
finally:
|
| 272 |
if driver:
|
| 273 |
+
try:
|
| 274 |
+
driver.quit()
|
| 275 |
+
except Exception:
|
| 276 |
+
pass
|
| 277 |
if user_data_dir and os.path.exists(user_data_dir):
|
| 278 |
try:
|
| 279 |
shutil.rmtree(user_data_dir, ignore_errors=True)
|
|
|
|
| 286 |
args = get_args()
|
| 287 |
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
|
| 288 |
os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
|
|
|
|
| 289 |
gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
|
| 290 |
gemini_manager = GeminiManager(gemini_keys)
|
| 291 |
+
posts = try_scrape_with_fallback(args.group.strip(), args.cookies_file, args.max_scrolls, args.scroll_pause)
|
|
|
|
|
|
|
| 292 |
with open(args.out, "w", encoding="utf-8") as f:
|
| 293 |
json.dump(posts, f, ensure_ascii=False, indent=2)
|
| 294 |
print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
|
| 295 |
print(f"::SCRAPE_SAVED::{args.out}")
|
|
|
|
| 296 |
keyword_hits, confirmed = [], []
|
| 297 |
for p in posts:
|
| 298 |
+
has, hits = contains_keywords(p.get("text", ""))
|
| 299 |
if has:
|
| 300 |
p["found_keywords"] = hits
|
| 301 |
keyword_hits.append(p)
|
| 302 |
print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
|
|
|
|
| 303 |
per_call_sleep = 5
|
| 304 |
for idx, p in enumerate(keyword_hits, start=1):
|
| 305 |
found_kws = p.get("found_keywords", [])
|
| 306 |
+
ai = ai_medical_intent(gemini_manager, p.get("text", ""), found_kws)
|
| 307 |
p["ai_analysis"] = ai
|
| 308 |
print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
|
| 309 |
if ai.get("is_medical_seeking"):
|
| 310 |
confirmed.append(p)
|
| 311 |
if idx < len(keyword_hits):
|
| 312 |
time.sleep(per_call_sleep)
|
|
|
|
| 313 |
report = {
|
| 314 |
+
"analysis_date": datetime.now().isoformat(),
|
| 315 |
+
"group_link": args.group,
|
| 316 |
+
"total_posts": len(posts),
|
| 317 |
+
"keyword_hits": len(keyword_hits),
|
| 318 |
+
"confirmed_medical": len(confirmed),
|
| 319 |
+
"emails_sent": 0,
|
| 320 |
+
"posts": confirmed
|
| 321 |
}
|
|
|
|
| 322 |
with open(args.analysis_out, "w", encoding="utf-8") as f:
|
| 323 |
json.dump(report, f, ensure_ascii=False, indent=2)
|
| 324 |
print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
|