Spaces:

sonuprasad23
/

fb_scraper

Sleeping

App Files Files Community

sonuprasad23 committed on Sep 5, 2025

Commit

f58cab6

1 Parent(s): 66d5034

Project Uploaded

Browse files

Files changed (1) hide show

final5.py +23 -35

final5.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import os, re, sys, time, json, base64, pickle, argparse, traceback
 from typing import List, Dict, Any, Tuple
 from datetime import datetime
 import tempfile
@@ -41,28 +41,20 @@ def get_args():
 GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
-# --- FIX: Simplify this function. It's not the primary auth method. ---
 def build_gmail_service():
-    """Builds Gmail service if a service account file exists."""
     if os.path.exists(SERVICE_ACCOUNT_FILE):
         try:
             sender_email = os.environ.get("SENDER_EMAIL")
             if not sender_email:
                 print("[GMAIL] SENDER_EMAIL environment variable not set.")
                 return None
             credentials = service_account.Credentials.from_service_account_file(
                 SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES).with_subject(sender_email)
-            svc = build("gmail", "v1", credentials=credentials)
-            return svc
         except Exception as e:
             print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
-            return None
-    print("[GMAIL] Service account file not found in final5.py.")
     return None
-# The send_html_email function is kept for potential future direct use, but it's not called by main()
 def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
     if not service:
         print("[GMAIL] service not available; skipping email")
@@ -224,16 +216,14 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
     hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
     return (len(hits) > 0, hits)
-# --- FIX: Set a writable cache path for Selenium Manager and ensure unique user-data-dir ---
-def new_driver(headless: bool):
     options = webdriver.ChromeOptions()
-    # Specify a writable directory for Selenium Manager's driver cache
     cache_path = os.path.join(WRITABLE_DIR, "selenium")
     os.makedirs(cache_path, exist_ok=True)
     os.environ["SE_CACHE_PATH"] = cache_path
-    # Create a unique temporary directory for this specific Chrome instance's user data
     user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
     options.add_argument(f"--user-data-dir={user_data_dir}")
@@ -263,8 +253,8 @@ def new_driver(headless: bool):
         })
     except Exception:
         pass
-    return driver
 def load_cookies(driver, cookies_file: str):
     print("[FB] Loading Facebook homepage...")
@@ -278,8 +268,8 @@ def load_cookies(driver, cookies_file: str):
                 cookie["sameSite"] = "Lax"
             try:
                 driver.add_cookie(cookie)
-            except Exception:
-                pass
         print("[FB] Cookies loaded. Refreshing page...")
         driver.refresh()
         time.sleep(5)
@@ -363,33 +353,31 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
     print(f"[SCRAPE] Total unique posts: {total}")
     return posts
 def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
     driver = None
     try:
-        driver = new_driver(headless=True)
         wait = WebDriverWait(driver, 15)
         load_cookies(driver, cookies_file)
         posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
-        return posts, driver
     except Exception as e:
         print(f"[SCRAPE] Error in headless mode: {e}")
-        return [], None
     finally:
         if driver:
             try:
-                # Also try to clean up the temporary user data directory
-                user_data_dir = None
-                for arg in driver.options.arguments:
-                    if arg.startswith('--user-data-dir='):
-                        user_data_dir = arg.split('=', 1)[1]
-                        break
                 driver.quit()
-                if user_data_dir and os.path.exists(user_data_dir):
-                    import shutil
-                    shutil.rmtree(user_data_dir, ignore_errors=True)
             except Exception as e:
-                print(f"Error during driver cleanup: {e}")
 def main():
     args = get_args()
@@ -406,11 +394,11 @@ def main():
                 gemini_keys.append(key)
     gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
-    # This is not used to send mail, but just to check if auth is possible
     _ = build_gmail_service()
-    posts, driver = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
-    # Driver is cleaned up in try_scrape_with_fallback's finally block
     try:
         with open(args.out, "w", encoding="utf-8") as f:

+import os, re, sys, time, json, base64, pickle, argparse, traceback, shutil
 from typing import List, Dict, Any, Tuple
 from datetime import datetime
 import tempfile
 GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
 def build_gmail_service():
     """Return a Gmail API client built from the service-account file, or None.

     None is returned when SERVICE_ACCOUNT_FILE does not exist, when the
     SENDER_EMAIL environment variable is unset or empty, or when creating
     the credentials or the client raises (the error is printed).
     """
     if not os.path.exists(SERVICE_ACCOUNT_FILE):
         return None

     try:
         impersonated_user = os.environ.get("SENDER_EMAIL")
         if not impersonated_user:
             print("[GMAIL] SENDER_EMAIL environment variable not set.")
             return None
         # Credentials are limited to GMAIL_SCOPES and use SENDER_EMAIL as subject.
         base_creds = service_account.Credentials.from_service_account_file(
             SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES)
         delegated_creds = base_creds.with_subject(impersonated_user)
         return build("gmail", "v1", credentials=delegated_creds)
     except Exception as auth_error:
         print(f"[GMAIL] Service account authentication failed in final5.py: {auth_error}")
         return None
 def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
     if not service:
         print("[GMAIL] service not available; skipping email")
     hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
     return (len(hits) > 0, hits)
+# --- FIX: Return the user_data_dir for explicit cleanup ---
+def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
     options = webdriver.ChromeOptions()
     cache_path = os.path.join(WRITABLE_DIR, "selenium")
     os.makedirs(cache_path, exist_ok=True)
     os.environ["SE_CACHE_PATH"] = cache_path
     user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
     options.add_argument(f"--user-data-dir={user_data_dir}")
         })
     except Exception:
         pass
+    return driver, user_data_dir
 def load_cookies(driver, cookies_file: str):
     print("[FB] Loading Facebook homepage...")
                 cookie["sameSite"] = "Lax"
             try:
                 driver.add_cookie(cookie)
+            except Exception as e:
+                print(f"Could not add cookie: {e}")
         print("[FB] Cookies loaded. Refreshing page...")
         driver.refresh()
         time.sleep(5)
     print(f"[SCRAPE] Total unique posts: {total}")
     return posts
+# --- FIX: Robust cleanup of the driver and its user data directory ---
 def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
     """Scrape one group in a headless browser and always release its resources.

     Args:
         group_url: Forwarded to scrape_group.
         cookies_file: Forwarded to load_cookies.
         max_scrolls: Forwarded to scrape_group.
         pause: Forwarded to scrape_group.

     Returns:
         The value returned by scrape_group, or an empty list when any step
         raised. Failures are printed and never propagated to the caller.
     """
     driver = None
     user_data_dir = None
     posts = []
     try:
         driver, user_data_dir = new_driver(headless=True)
         wait = WebDriverWait(driver, 15)
         load_cookies(driver, cookies_file)
         posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
     except Exception as e:
         print(f"[SCRAPE] Error in headless mode: {e}")
     finally:
         # Quit the browser first so it is no longer writing to its profile
         # directory when that directory is removed.
         if driver:
             try:
                 driver.quit()
             except Exception as e:
                 print(f"Error during driver.quit(): {e}")
         if user_data_dir and os.path.exists(user_data_dir):
             # ignore_errors=True keeps removal best-effort (it continues past
             # files that cannot be deleted) but it also means rmtree never
             # raises, so the outcome has to be checked explicitly.
             shutil.rmtree(user_data_dir, ignore_errors=True)
             if os.path.exists(user_data_dir):
                 print(f"Error cleaning up user data directory {user_data_dir}: "
                       "directory still exists after removal attempt")
             else:
                 print(f"Cleaned up user data directory: {user_data_dir}")
     return posts
 def main():
     args = get_args()
                 gemini_keys.append(key)
     gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
+    # This is not used to send mail, just to confirm auth is possible.
     _ = build_gmail_service()
+    # Call the modified function which now returns only posts.
+    posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
     try:
         with open(args.out, "w", encoding="utf-8") as f: