Spaces:

sonuprasad23
/

fb_scraper

Sleeping

App Files Files Community

sonuprasad23 committed on Sep 5, 2025

Commit

5571520

1 Parent(s): f58cab6

Project Uploaded

Browse files

Files changed (1) hide show

final5.py +103 -201

final5.py CHANGED Viewed

@@ -55,10 +55,9 @@ def build_gmail_service():
             print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
     return None
 def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
-    if not service:
-        print("[GMAIL] service not available; skipping email")
-        return 0
     from email.message import EmailMessage
     sent = 0
     for to in to_list:
@@ -71,8 +70,6 @@ def send_html_email(service, sender: str, to_list: List[str], subject: str, html
             raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
             service.users().messages().send(userId="me", body={"raw": raw}).execute()
             sent += 1
-        except HttpError as e:
-            print(f"[GMAIL] http error to {to}: {e}")
         except Exception as e:
             print(f"[GMAIL] send error to {to}: {e}")
     return sent
@@ -123,55 +120,30 @@ class GeminiManager:
             else:
                 raise e
-def parse_retry_seconds_from_error(err: Exception) -> int:
-    s = str(err)
-    m1 = re.search(r"retry[_ ]delay\s*\{\s*seconds:\s*(\d+)", s, re.IGNORECASE)
-    if m1: return int(m1.group(1))
-    m2 = re.search(r'"retryDelay"\s*:\s*"(\d+)s"', s)
-    if m2: return int(m2.group(1))
-    return 45
 def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
     fallback = {
-        "is_medical_seeking": False,
-        "confidence": "low",
         "medical_summary": "Not a medical request (AI unavailable/throttled)",
-        "suggested_services": [],
-        "urgency_level": "low",
-        "analysis": "Keyword-based fallback",
-        "reasoning": "short explanation",
-        "matched_keywords": found_keywords
     }
-    if not gemini_manager or not gemini_manager.is_available():
-        return fallback
     keywords_str = ", ".join(found_keywords) if found_keywords else "none"
     prompt = f"""
 Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
 KEYWORDS FOUND IN POST: {keywords_str}
 CRITICAL RULES:
 1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
-2. IGNORE posts about:
-   - Business services (e.g., "Looking for a doctor for my clinic")
-   - Donations or fundraising (e.g., "Raising money for surgery")
-   - Selling medical products
-   - Job postings for medical professionals
-   - General health information sharing
-   - Research or academic inquiries
 3. ONLY flag if it's a PERSONAL HEALTH NEED
 Post: "{post_text}"
 Return ONLY JSON:
 {{
-  "is_medical_seeking": true/false,
-  "confidence": "high/medium/low",
-  "medical_summary": "short summary",
-  "suggested_services": ["service1","service2"],
-  "urgency_level": "high/medium/low",
-  "analysis": "why it's seeking help",
-  "reasoning": "short explanation",
-  "matched_keywords": ["keyword1", "keyword2"]
-}}
-"""
-    for attempt in range(1, 5):
         try:
             resp = gemini_manager.generate_content(prompt)
             txt = (resp.text or "").strip()
@@ -179,36 +151,26 @@ Return ONLY JSON:
             if s >= 0 and e > s:
                 result = json.loads(txt[s:e])
                 result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
-                if "matched_keywords" not in result:
-                    result["matched_keywords"] = found_keywords
                 return result
             return fallback
-        except ResourceExhausted as e:
-            wait_s = min(parse_retry_seconds_from_error(e) + 2, 120)
-            print(f"[GEMINI] 429 rate limit; backoff {wait_s}s (attempt {attempt}/4)")
-            time.sleep(wait_s)
-            if gemini_manager.is_available():
-                continue
-            else:
-                return fallback
         except Exception as e:
             print(f"[GEMINI] error: {e}")
             gemini_manager.rotate_key()
-            if not gemini_manager.is_available():
-                return fallback
     return fallback
 MEDICAL_KEYWORDS = [
-    "doctor","physician","primary care","healthcare","medical","clinic","hospital",
-    "urgent care","emergency","er","specialist","pediatrician","dentist",
-    "gynecologist","obgyn","women's health","health center","family doctor",
-    "maternity","prenatal","postnatal","labor","delivery",
-    "need doctor","looking for doctor","find doctor","recommend doctor",
-    "medical help","health help","appointment","checkup","treatment",
-    "prescription","medicine","surgery","best hospital","best clinic",
-    "where to go","doctor recommendation",
-    "pregnancy","birth control","contraception","fertility",
-    "hillside","medical group","wellness center"
 ]
 def contains_keywords(text: str) -> Tuple[bool, List[str]]:
@@ -216,67 +178,55 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
     hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
     return (len(hits) > 0, hits)
-# --- FIX: Return the user_data_dir for explicit cleanup ---
 def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     options = webdriver.ChromeOptions()
     cache_path = os.path.join(WRITABLE_DIR, "selenium")
     os.makedirs(cache_path, exist_ok=True)
     os.environ["SE_CACHE_PATH"] = cache_path
     user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
-    options.add_argument(f"--user-data-dir={user_data_dir}")
     options.add_argument("--disable-notifications")
-    options.add_argument("--disable-web-security")
-    options.add_argument("--disable-features=IsolateOrigins,site-per-process")
-    options.add_argument("--disable-blink-features=AutomationControlled")
-    options.add_experimental_option("useAutomationExtension", False)
-    options.add_experimental_option("excludeSwitches", ["enable-automation"])
     options.add_argument("--window-size=1920,1080")
-    options.add_argument("--lang=en-US,en")
-    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
-    if headless:
-        options.add_argument("--headless=new")
-        options.add_argument("--disable-gpu")
-        options.add_argument("--disable-dev-shm-usage")
-        options.add_argument("--no-sandbox")
-        options.add_argument("--disable-extensions")
-        options.add_argument("--disable-plugins")
-        options.add_argument("--disable-images")
     driver = webdriver.Chrome(options=options)
-    try:
-        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-            "source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
-        })
-    except Exception:
-        pass
     return driver, user_data_dir
 def load_cookies(driver, cookies_file: str):
-    print("[FB] Loading Facebook homepage...")
     driver.get("https://www.facebook.com")
-    time.sleep(3)
-    try:
-        with open(cookies_file, "rb") as f:
-            cookies = pickle.load(f)
-        for cookie in cookies:
-            if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
-                cookie["sameSite"] = "Lax"
-            try:
-                driver.add_cookie(cookie)
-            except Exception as e:
-                print(f"Could not add cookie: {e}")
-        print("[FB] Cookies loaded. Refreshing page...")
-        driver.refresh()
-        time.sleep(5)
-    except FileNotFoundError:
-        raise RuntimeError(f"[FB] Cookies file not found: {cookies_file}")
-    except Exception as e:
-        raise RuntimeError(f"[FB] Cookie load error: {e}")
 def wait_group_feed(driver, wait):
     wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
@@ -290,93 +240,68 @@ def wait_group_feed(driver, wait):
             try:
                 driver.find_element(By.XPATH, "//div[@role='article']")
                 feed_loaded = True; break
-            except NoSuchElementException:
-                pass
         time.sleep(1)
     if not feed_loaded:
-        raise TimeoutException("Timed out waiting for group feed")
 def find_message_nodes(driver):
-    nodes = driver.find_elements(By.XPATH, "//div[@data-ad-preview='message']")
-    if nodes: return nodes
-    nodes = driver.find_elements(By.XPATH, "//div[@data-ad-comet-preview='message']")
-    if nodes: return nodes
-    return driver.find_elements(By.XPATH, "//div[@role='article']//div[@dir='auto' and string-length(normalize-space())>0]")
 def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
     print(f"[SCRAPE] Navigating to group: {group_url}")
     driver.get(group_url)
     wait_group_feed(driver, wait)
-    posts, seen, rects = [], set(), set()
-    total = 0
     for s in range(max_scrolls):
         print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-        try:
-            wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
-        except Exception:
-            pass
         time.sleep(pause)
-        try:
-            divs = find_message_nodes(driver)
-            print(f"[SCRAPE] Nodes found: {len(divs)}")
-        except Exception as e:
-            print(f"[SCRAPE] find error: {e}")
-            continue
-        added = 0
-        for i, d in enumerate(divs):
-            try:
-                rect = (d.rect.get('x'), d.rect.get('y'), d.rect.get('width'), d.rect.get('height'))
-                if rect in rects: continue
-                rects.add(rect)
-            except Exception:
-                pass
             try:
                 txt = (d.text or "").strip()
-                if len(txt) < 20:
-                    try:
-                        art = d.find_element(By.XPATH, "ancestor::div[@role='article']")
-                        txt = (art.text or "").strip()
-                    except Exception:
-                        pass
             except StaleElementReferenceException:
                 continue
-            if not txt or len(txt) < 20: continue
-            if txt in seen: continue
-            wc = len(re.findall(r"\b\w+\b", txt))
-            if wc > 7 and not any(j in txt for j in ["LikeCommentShare","Write a comment","View more comments"]):
-                seen.add(txt)
-                total += 1
-                posts.append({"id": total, "text": txt, "group_link": group_url})
-                added += 1
-        print(f"[SCRAPE] New posts this scroll: {added}")
-    print(f"[SCRAPE] Total unique posts: {total}")
     return posts
-# --- FIX: Robust cleanup of the driver and its user data directory ---
 def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
     driver = None
     user_data_dir = None
     posts = []
     try:
         driver, user_data_dir = new_driver(headless=True)
-        wait = WebDriverWait(driver, 15)
         load_cookies(driver, cookies_file)
         posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
     except Exception as e:
-        print(f"[SCRAPE] Error in headless mode: {e}")
     finally:
         if driver:
-            try:
-                driver.quit()
-            except Exception as e:
-                print(f"Error during driver.quit(): {e}")
         if user_data_dir and os.path.exists(user_data_dir):
             try:
                 shutil.rmtree(user_data_dir, ignore_errors=True)
-                print(f"Cleaned up user data directory: {user_data_dir}")
             except Exception as e:
-                print(f"Error cleaning up user data directory {user_data_dir}: {e}")
     return posts
 def main():
@@ -384,29 +309,15 @@ def main():
     os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
     os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
-    gemini_keys = []
-    if args.gemini_keys:
-        gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()]
-    else:
-        for i in range(1, 6):
-            key = os.environ.get(f"GEMINI_API_KEY_{i}")
-            if key:
-                gemini_keys.append(key)
-    gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
-    # This is not used to send mail, just to confirm auth is possible.
-    _ = build_gmail_service()
-    # Call the modified function which now returns only posts.
     posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
-    try:
-        with open(args.out, "w", encoding="utf-8") as f:
-            json.dump(posts, f, ensure_ascii=False, indent=2)
-        print(f"[SCRAPE] Saved scraped posts to {args.out}")
-        print(f"::SCRAPE_SAVED::{args.out}")
-    except Exception as e:
-        print(f"[SCRAPE] Error saving posts: {e}")
     keyword_hits, confirmed = [], []
     for p in posts:
@@ -416,8 +327,7 @@ def main():
             keyword_hits.append(p)
             print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
-    per_call_sleep = 7
-    analyzed_posts = []
     for idx, p in enumerate(keyword_hits, start=1):
         found_kws = p.get("found_keywords", [])
         ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
@@ -425,32 +335,24 @@ def main():
         print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
         if ai.get("is_medical_seeking"):
             confirmed.append(p)
-            analyzed_posts.append(p)
         if idx < len(keyword_hits):
             time.sleep(per_call_sleep)
     report = {
-        "analysis_date": datetime.now().isoformat(),
-        "group_link": args.group,
-        "total_posts": len(posts),
-        "keyword_hits": len(keyword_hits),
-        "confirmed_medical": len(confirmed),
-        "emails_sent": 0,
-        "posts": confirmed
     }
-    try:
-        with open(args.analysis_out, "w", encoding="utf-8") as f:
-            json.dump(report, f, ensure_ascii=False, indent=2)
-        print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
-        print(f"::ANALYSIS_SAVED::{args.analysis_out}")
-    except Exception as e:
-        print(f"[ANALYSIS] Error saving analysis: {e}")
 if __name__ == "__main__":
     try:
         main()
-    except Exception as e:
-        print(f"Unhandled error in main: {e}")
-        print(traceback.format_exc())
-        raise

             print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
     return None
+# The send_html_email function is not used by main() but is kept for modularity
 def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
+    if not service: return 0
     from email.message import EmailMessage
     sent = 0
     for to in to_list:
             raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
             service.users().messages().send(userId="me", body={"raw": raw}).execute()
             sent += 1
         except Exception as e:
             print(f"[GMAIL] send error to {to}: {e}")
     return sent
             else:
                 raise e
 def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
     fallback = {
+        "is_medical_seeking": False, "confidence": "low",
         "medical_summary": "Not a medical request (AI unavailable/throttled)",
+        "suggested_services": [], "urgency_level": "low", "analysis": "Keyword-based fallback",
+        "reasoning": "short explanation", "matched_keywords": found_keywords
     }
+    if not gemini_manager or not gemini_manager.is_available(): return fallback
     keywords_str = ", ".join(found_keywords) if found_keywords else "none"
     prompt = f"""
 Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
 KEYWORDS FOUND IN POST: {keywords_str}
 CRITICAL RULES:
 1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
+2. IGNORE posts about: business services, donations, selling products, job postings, general info sharing, or academic inquiries.
 3. ONLY flag if it's a PERSONAL HEALTH NEED
 Post: "{post_text}"
 Return ONLY JSON:
 {{
+  "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
+  "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
+  "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1", "keyword2"]
+}}"""
+    for _ in range(1, 5):
         try:
             resp = gemini_manager.generate_content(prompt)
             txt = (resp.text or "").strip()
             if s >= 0 and e > s:
                 result = json.loads(txt[s:e])
                 result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
+                if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
                 return result
             return fallback
+        except ResourceExhausted:
+            gemini_manager.rotate_key()
+            if not gemini_manager.is_available(): return fallback
         except Exception as e:
             print(f"[GEMINI] error: {e}")
             gemini_manager.rotate_key()
+            if not gemini_manager.is_available(): return fallback
     return fallback
 # Phrases that contains_keywords() searches for in a post's text.
 # NOTE(review): contains_keywords() tests `kw in tl`, i.e. plain substring
 # matching (assuming tl is the post text as a string). Very short entries such
 # as "er" will therefore also match inside unrelated words ("her", "other") --
 # confirm this is intended or switch to word-boundary matching.
 MEDICAL_KEYWORDS = [
+    "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
+    "emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
+    "health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
+    "need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
+    "appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
+    "best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
+    "fertility","hillside","medical group","wellness center"
 ]
 def contains_keywords(text: str) -> Tuple[bool, List[str]]:
     hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
     return (len(hits) > 0, hits)
+# --- FIX #1: The Definitive Solution for the Selenium Crash ---
 def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     """Start a Chrome WebDriver that uses a throwaway profile directory.

     Args:
         headless: When true, Chrome is started with ``--headless=new``.

     Returns:
         A ``(driver, user_data_dir)`` tuple. The caller owns both: it must
         quit the driver and delete ``user_data_dir`` when finished.

     Raises:
         Exception: Whatever ``webdriver.Chrome`` raises when the browser
             cannot be started. The temporary profile directory is removed
             before the exception propagates.
     """
     options = webdriver.ChromeOptions()

     # Point Selenium's cache at a directory under WRITABLE_DIR.
     cache_path = os.path.join(WRITABLE_DIR, "selenium")
     os.makedirs(cache_path, exist_ok=True)
     os.environ["SE_CACHE_PATH"] = cache_path

     # Fresh Chrome profile per session, also under WRITABLE_DIR.
     user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
     options.add_argument(f"--user-data-dir={user_data_dir}")

     # Honor the caller's choice instead of forcing headless mode.
     if headless:
         options.add_argument("--headless=new")

     # Flags used for running Chrome inside a container.
     options.add_argument("--no-sandbox")
     options.add_argument("--disable-dev-shm-usage")
     options.add_argument("--disable-gpu")
     options.add_argument("--disable-notifications")
     options.add_argument("--window-size=1920,1080")

     try:
         driver = webdriver.Chrome(options=options)
     except Exception:
         # The caller never receives user_data_dir on failure, so remove it
         # here; otherwise every failed start leaks a profile directory.
         shutil.rmtree(user_data_dir, ignore_errors=True)
         raise

     print("[SELENIUM] WebDriver session created successfully.")
     return driver, user_data_dir
+# --- FIX #2: Add Better Logging to the Login Process ---
 def load_cookies(driver, cookies_file: str):
     """Restore a Facebook session in ``driver`` from a pickled cookie file.

     Opens the Facebook homepage first (Selenium only accepts cookies for the
     domain of the page currently loaded), adds each cookie, refreshes the
     page, and prints whether the resulting page title still looks like a
     login page.

     Args:
         driver: Selenium WebDriver to receive the cookies.
         cookies_file: Path to a pickle file holding an iterable of cookie dicts.

     Raises:
         RuntimeError: If ``cookies_file`` does not exist.
     """
     print("[FB] Navigating to Facebook homepage to load cookies...")
     driver.get("https://www.facebook.com")
     time.sleep(2)

     try:
         # NOTE(security): pickle.load can execute arbitrary code embedded in
         # the file. Only use cookie files produced by a trusted source.
         with open(cookies_file, "rb") as f:
             cookies = pickle.load(f)
     except FileNotFoundError:
         raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}") from None

     loaded = 0
     for cookie in cookies:
         # Replace sameSite values outside the three allowed ones.
         if "sameSite" in cookie and cookie["sameSite"] not in ["Strict", "Lax", "None"]:
             cookie["sameSite"] = "Lax"
         try:
             driver.add_cookie(cookie)
             loaded += 1
         except Exception as e:
             # One rejected cookie (e.g. wrong domain) must not abort the
             # whole session restore; log it and keep going.
             print(f"[FB] Could not add cookie {cookie.get('name', '?')}: {e}")

     print(f"[FB] {loaded}/{len(cookies)} cookies loaded. Refreshing page to apply session...")
     driver.refresh()
     time.sleep(5)

     # Heuristic login check: a title containing "log in" suggests the
     # session cookies were not accepted.
     if "log in" in driver.title.lower():
         print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
     else:
         print(f"[FB] Login appears successful. Page title is: '{driver.title}'")
 def wait_group_feed(driver, wait):
     wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
             try:
                 driver.find_element(By.XPATH, "//div[@role='article']")
                 feed_loaded = True; break
+            except NoSuchElementException: pass
         time.sleep(1)
     if not feed_loaded:
+        raise TimeoutException("Timed out waiting for group feed to load.")
 def find_message_nodes(driver):
     """Return all elements on the current page matching ``//div[@role='article']``."""
     article_xpath = "//div[@role='article']"
     nodes = driver.find_elements(By.XPATH, article_xpath)
     return nodes
 def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
     """Scroll through a group feed and collect the unique post texts seen.

     Loads ``group_url``, waits for the feed with ``wait_group_feed``, then
     scrolls to the bottom ``max_scrolls`` times, sleeping ``pause`` seconds
     after each scroll before reading the nodes from ``find_message_nodes``.

     Returns:
         A list of dicts with keys ``id`` (1-based, in discovery order),
         ``text`` and ``group_link``.
     """
     print(f"[SCRAPE] Navigating to group: {group_url}")
     driver.get(group_url)
     wait_group_feed(driver, wait)

     # Fragments of page UI text; any node containing one is not kept as a post.
     ui_markers = ("Comment Share", "Write a comment...", "View more comments")
     collected = []
     known_texts = set()

     for scroll_idx in range(max_scrolls):
         print(f"[SCRAPE] --- Scroll {scroll_idx+1}/{max_scrolls} ---")
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(pause)

         new_count = 0
         for node in find_message_nodes(driver):
             try:
                 body = (node.text or "").strip()
             except StaleElementReferenceException:
                 # The node was re-rendered between lookup and read; skip it.
                 continue
             # Skip very short texts and anything already collected.
             if len(body) < 25 or body in known_texts:
                 continue
             if any(marker in body for marker in ui_markers):
                 continue
             known_texts.add(body)
             collected.append({"id": len(collected) + 1, "text": body, "group_link": group_url})
             new_count += 1

         print(f"[SCRAPE] Found {new_count} new, unique posts this scroll.")

     print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(collected)}")
     return collected
+# --- FIX #3: Make the Script Fail Properly on Critical Errors ---
 def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
     """Run one headless scrape of ``group_url`` and always clean up afterwards.

     Creates a driver, loads the session cookies, scrapes the group, and in
     every case quits the driver and removes its temporary profile directory.

     Returns:
         The list of posts returned by ``scrape_group``.

     Raises:
         Exception: Any error from driver creation, cookie loading, or
             scraping is logged and re-raised so the caller can fail.
     """
     driver = None
     user_data_dir = None
     posts = []
     try:
         driver, user_data_dir = new_driver(headless=True)
         wait = WebDriverWait(driver, 20)
         load_cookies(driver, cookies_file)
         posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
     except Exception as e:
         print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
         # Re-raise so the script exits with a non-zero code.
         raise
     finally:
         if driver:
             try:
                 driver.quit()
             except Exception as e:
                 # Best-effort shutdown: report the failure instead of hiding
                 # it, but do not mask the original scraping error.
                 print(f"[SELENIUM] Error during driver.quit(): {e}")
         if user_data_dir and os.path.exists(user_data_dir):
             try:
                 # No ignore_errors here, so a failed removal reaches the
                 # except branch rather than being reported as a success.
                 shutil.rmtree(user_data_dir)
                 print(f"[SELENIUM] Cleaned up user data directory: {user_data_dir}")
             except OSError as e:
                 print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
     return posts
 def main():
     os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
     os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
+    gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
+    gemini_manager = GeminiManager(gemini_keys)
     posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
+    with open(args.out, "w", encoding="utf-8") as f:
+        json.dump(posts, f, ensure_ascii=False, indent=2)
+    print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
+    print(f"::SCRAPE_SAVED::{args.out}")
     keyword_hits, confirmed = [], []
     for p in posts:
             keyword_hits.append(p)
             print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
+    per_call_sleep = 5
     for idx, p in enumerate(keyword_hits, start=1):
         found_kws = p.get("found_keywords", [])
         ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
         print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
         if ai.get("is_medical_seeking"):
             confirmed.append(p)
         if idx < len(keyword_hits):
             time.sleep(per_call_sleep)
     report = {
+        "analysis_date": datetime.now().isoformat(), "group_link": args.group,
+        "total_posts": len(posts), "keyword_hits": len(keyword_hits),
+        "confirmed_medical": len(confirmed), "emails_sent": 0, "posts": confirmed
     }
+    with open(args.analysis_out, "w", encoding="utf-8") as f:
+        json.dump(report, f, ensure_ascii=False, indent=2)
+    print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
+    print(f"::ANALYSIS_SAVED::{args.analysis_out}")
 if __name__ == "__main__":
     try:
         main()
     except Exception:
         # try_scrape_with_fallback only prints the exception message, and
         # other failures in main() may not be logged at all, so print the
         # full traceback here before exiting.
         import traceback
         print(traceback.format_exc())
         print("Main execution failed. Exiting with error.")
         sys.exit(1)  # Ensure a non-zero exit code on failure