Spaces:

sonuprasad23
/

fb_scraper

Sleeping

App Files Files Community

sonuprasad23 committed on Sep 5, 2025

Commit

00f0b39

1 Parent(s): 5571520

Project Uploaded

Browse files

Files changed (1) hide show

final5.py +29 -86

final5.py CHANGED Viewed

@@ -14,7 +14,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import (
-    StaleElementReferenceException, NoSuchElementException, TimeoutException, SessionNotCreatedException
 )
 from google.oauth2 import service_account
 from googleapiclient.discovery import build
@@ -39,44 +39,22 @@ def get_args():
     p.add_argument("--headless", action="store_true", help="Prefer headless browser")
     return p.parse_args()
-GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
 def build_gmail_service():
     if os.path.exists(SERVICE_ACCOUNT_FILE):
         try:
             sender_email = os.environ.get("SENDER_EMAIL")
-            if not sender_email:
-                print("[GMAIL] SENDER_EMAIL environment variable not set.")
-                return None
             credentials = service_account.Credentials.from_service_account_file(
-                SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES).with_subject(sender_email)
             return build("gmail", "v1", credentials=credentials)
         except Exception as e:
-            print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
     return None
-# The send_html_email function is not used by main() but is kept for modularity
-def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
-    if not service: return 0
-    from email.message import EmailMessage
-    sent = 0
-    for to in to_list:
-        try:
-            msg = EmailMessage()
-            msg["to"] = to
-            msg["from"] = sender
-            msg["subject"] = subject
-            msg.set_content(html, subtype="html")
-            raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
-            service.users().messages().send(userId="me", body={"raw": raw}).execute()
-            sent += 1
-        except Exception as e:
-            print(f"[GMAIL] send error to {to}: {e}")
-    return sent
 GEMINI_MODEL = "gemini-1.5-flash"
 class GeminiManager:
     def __init__(self, api_keys: List[str]):
         self.api_keys = api_keys
         self.current_key_index = 0
@@ -120,30 +98,25 @@ class GeminiManager:
             else:
                 raise e
 def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
-    fallback = {
-        "is_medical_seeking": False, "confidence": "low",
-        "medical_summary": "Not a medical request (AI unavailable/throttled)",
-        "suggested_services": [], "urgency_level": "low", "analysis": "Keyword-based fallback",
-        "reasoning": "short explanation", "matched_keywords": found_keywords
-    }
     if not gemini_manager or not gemini_manager.is_available(): return fallback
     keywords_str = ", ".join(found_keywords) if found_keywords else "none"
-    prompt = f"""
-Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
-KEYWORDS FOUND IN POST: {keywords_str}
-CRITICAL RULES:
-1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
-2. IGNORE posts about: business services, donations, selling products, job postings, general info sharing, or academic inquiries.
-3. ONLY flag if it's a PERSONAL HEALTH NEED
 Post: "{post_text}"
 Return ONLY JSON:
 {{
   "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
   "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
-  "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1", "keyword2"]
 }}"""
-    for _ in range(1, 5):
         try:
             resp = gemini_manager.generate_content(prompt)
             txt = (resp.text or "").strip()
@@ -154,31 +127,19 @@ Return ONLY JSON:
                 if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
                 return result
             return fallback
-        except ResourceExhausted:
-            gemini_manager.rotate_key()
-            if not gemini_manager.is_available(): return fallback
         except Exception as e:
-            print(f"[GEMINI] error: {e}")
             gemini_manager.rotate_key()
-            if not gemini_manager.is_available(): return fallback
     return fallback
-MEDICAL_KEYWORDS = [
-    "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
-    "emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
-    "health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
-    "need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
-    "appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
-    "best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
-    "fertility","hillside","medical group","wellness center"
-]
 def contains_keywords(text: str) -> Tuple[bool, List[str]]:
     tl = (text or "").lower()
     hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
     return (len(hits) > 0, hits)
-# --- FIX #1: The Definitive Solution for the Selenium Crash ---
 def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     options = webdriver.ChromeOptions()
@@ -192,7 +153,7 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     options.add_argument(f"--user-data-dir={user_data_dir}")
     options.add_argument("--headless=new")
     options.add_argument("--no-sandbox")
-    options.add_argument("--disable-dev-shm-usage") # CRITICAL: THIS IS THE FIX
     options.add_argument("--disable-gpu")
     options.add_argument("--disable-notifications")
     options.add_argument("--window-size=1920,1080")
@@ -201,7 +162,6 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     print("[SELENIUM] WebDriver session created successfully.")
     return driver, user_data_dir
-# --- FIX #2: Add Better Logging to the Login Process ---
 def load_cookies(driver, cookies_file: str):
     print("[FB] Navigating to Facebook homepage to load cookies...")
     driver.get("https://www.facebook.com")
@@ -222,7 +182,6 @@ def load_cookies(driver, cookies_file: str):
     driver.refresh()
     time.sleep(5)
-    # Check for login success by looking for a keyword in the title
     if "log in" in driver.title.lower():
         print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
     else:
@@ -230,24 +189,12 @@ def load_cookies(driver, cookies_file: str):
 def wait_group_feed(driver, wait):
     wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-    feed_loaded = False
-    start = time.time(); timeout = 30
-    while not feed_loaded and (time.time() - start) < timeout:
-        try:
-            driver.find_element(By.XPATH, "//div[@data-pagelet='GroupFeed' or @role='feed']")
-            feed_loaded = True; break
-        except NoSuchElementException:
-            try:
-                driver.find_element(By.XPATH, "//div[@role='article']")
-                feed_loaded = True; break
-            except NoSuchElementException: pass
-        time.sleep(1)
-    if not feed_loaded:
         raise TimeoutException("Timed out waiting for group feed to load.")
-def find_message_nodes(driver):
-    return driver.find_elements(By.XPATH, "//div[@role='article']")
 def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
     print(f"[SCRAPE] Navigating to group: {group_url}")
     driver.get(group_url)
@@ -258,17 +205,13 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(pause)
-        divs = find_message_nodes(driver)
         added_this_scroll = 0
         for d in divs:
             try:
                 txt = (d.text or "").strip()
                 if len(txt) < 25 or txt in seen: continue
-                # Filter out common UI text that gets scraped as a post
-                if any(ui_text in txt for ui_text in ["Comment Share", "Write a comment...", "View more comments"]):
-                    continue
                 seen.add(txt)
                 posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
                 added_this_scroll += 1
@@ -278,7 +221,6 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
     print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
     return posts
-# --- FIX #3: Make the Script Fail Properly on Critical Errors ---
 def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
     driver = None
     user_data_dir = None
@@ -290,8 +232,7 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
         posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
     except Exception as e:
         print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
-        # Re-raise the exception to make the script exit with a non-zero code
-        raise
     finally:
         if driver:
             try: driver.quit()
@@ -303,6 +244,8 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
             except Exception as e:
                 print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
     return posts
 def main():
     args = get_args()

 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import (
+    StaleElementReferenceException, NoSuchElementException, TimeoutException
 )
 from google.oauth2 import service_account
 from googleapiclient.discovery import build
     p.add_argument("--headless", action="store_true", help="Prefer headless browser")
     return p.parse_args()
+# This function is not called in the main flow but kept for modularity
 def build_gmail_service():
     """Build a Gmail API client from the service-account key file.

     The credentials are scoped to ``gmail.send`` and delegated to the
     address held in the ``SENDER_EMAIL`` environment variable.

     Returns:
         The object returned by ``build("gmail", "v1", ...)``, or ``None``
         when the key file does not exist, ``SENDER_EMAIL`` is unset/empty,
         or any exception is raised while authenticating.
     """
     if not os.path.exists(SERVICE_ACCOUNT_FILE):
         return None
     try:
         sender_email = os.environ.get("SENDER_EMAIL")
         if not sender_email:
             return None
         base_credentials = service_account.Credentials.from_service_account_file(
             SERVICE_ACCOUNT_FILE,
             scopes=["https://www.googleapis.com/auth/gmail.send"],
         )
         # Delegate to the sender's mailbox.
         delegated = base_credentials.with_subject(sender_email)
         return build("gmail", "v1", credentials=delegated)
     except Exception as e:
         # Best-effort: report the failure and fall back to "no service".
         print(f"[GMAIL] Auth failed in final5.py: {e}")
         return None
 GEMINI_MODEL = "gemini-1.5-flash"
 class GeminiManager:
+    # ... (This class is correct, no changes needed)
     def __init__(self, api_keys: List[str]):
         self.api_keys = api_keys
         self.current_key_index = 0
             else:
                 raise e
 def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
+    fallback = { "is_medical_seeking": False, "confidence": "low", "medical_summary": "AI unavailable", "suggested_services": [], "urgency_level": "low", "analysis": "Fallback", "reasoning": "AI error", "matched_keywords": found_keywords }
     if not gemini_manager or not gemini_manager.is_available(): return fallback
     keywords_str = ", ".join(found_keywords) if found_keywords else "none"
+    prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
+KEYWORDS: {keywords_str}
+RULES:
+1. Flag ONLY posts where someone seeks medical care for themselves or a loved one.
+2. IGNORE posts about business, donations, selling products, jobs, or general info.
+3. Flag ONLY if it is a PERSONAL HEALTH NEED.
 Post: "{post_text}"
 Return ONLY JSON:
 {{
   "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
   "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
+  "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1"]
 }}"""
+    for _ in range(2): # Reduced retries for speed
         try:
             resp = gemini_manager.generate_content(prompt)
             txt = (resp.text or "").strip()
                 if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
                 return result
             return fallback
         except Exception as e:
+            print(f"[GEMINI] Error: {e}")
             gemini_manager.rotate_key()
     return fallback
+MEDICAL_KEYWORDS = [ "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care","emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health","health center","family doctor","maternity","prenatal","postnatal","labor","delivery","need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help","appointment","checkup","treatment","prescription","medicine","surgery","best hospital","best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception","fertility","hillside","medical group","wellness center" ]
 def contains_keywords(text: str) -> Tuple[bool, List[str]]:
     tl = (text or "").lower()
     hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
     return (len(hits) > 0, hits)
+# --- START: CRITICAL SELENIUM FIXES ---
 def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     options = webdriver.ChromeOptions()
     options.add_argument(f"--user-data-dir={user_data_dir}")
     options.add_argument("--headless=new")
     options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage") # THIS IS THE KEY FIX
     options.add_argument("--disable-gpu")
     options.add_argument("--disable-notifications")
     options.add_argument("--window-size=1920,1080")
     print("[SELENIUM] WebDriver session created successfully.")
     return driver, user_data_dir
 def load_cookies(driver, cookies_file: str):
     print("[FB] Navigating to Facebook homepage to load cookies...")
     driver.get("https://www.facebook.com")
     driver.refresh()
     time.sleep(5)
     if "log in" in driver.title.lower():
         print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
     else:
 def wait_group_feed(driver, wait):
     """Block until the page body and then the group feed container are present.

     Args:
         driver: Accepted for interface compatibility; not used here.
         wait: Object whose ``until`` method is called with the presence conditions.

     Raises:
         TimeoutException: With a feed-specific message when the feed
             container does not appear; a timeout on the initial body wait
             propagates unchanged.
     """
     body_locator = (By.TAG_NAME, "body")
     feed_locator = (By.XPATH, "//div[@role='feed' or @data-pagelet='GroupFeed']")

     wait.until(EC.presence_of_element_located(body_locator))
     try:
         wait.until(EC.presence_of_element_located(feed_locator))
     except TimeoutException:
         raise TimeoutException("Timed out waiting for group feed to load.")
     else:
         print("[SCRAPE] Group feed detected.")
 def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
     print(f"[SCRAPE] Navigating to group: {group_url}")
     driver.get(group_url)
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(pause)
+        divs = driver.find_elements(By.XPATH, "//div[@role='article']")
         added_this_scroll = 0
         for d in divs:
             try:
                 txt = (d.text or "").strip()
                 if len(txt) < 25 or txt in seen: continue
+                if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]): continue
                 seen.add(txt)
                 posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
                 added_this_scroll += 1
     print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
     return posts
 def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
     driver = None
     user_data_dir = None
         posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
     except Exception as e:
         print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
+        raise # Re-raise the exception to make the script exit with a non-zero code
     finally:
         if driver:
             try: driver.quit()
             except Exception as e:
                 print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
     return posts
+# --- END: CRITICAL SELENIUM FIXES ---
 def main():
     args = get_args()